From 87e7ec3365d741111b50c1fb63bbf502332dc9b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Rodr=C3=ADguez?= Date: Thu, 9 Oct 2025 17:24:56 -0700 Subject: [PATCH 1/3] [llvm-nm] Improve performance while faking symbols from function starts By default `nm` will look into `LC_FUNCTION_STARTS` for binaries that have the flag `MH_NLIST_OUTOFSYNC_WITH_DYLDINFO` set unless the `--no-dyldinfo` flag is passed. The implementation that looked for those `LC_FUNCTION_STARTS` entries in the symbol list was a doubly nested loop that scanned the symbol list over and over again for each of the `LC_FUNCTION_STARTS` entries. For binaries with a couple million function starts and hundreds of thousands of symbols, the nested loop does not seem to finish and takes hours even on powerful machines. Instead of the nested loop, trade memory for time and add all the addresses of the symbols into a set that can then be checked very quickly for each of the `LC_FUNCTION_STARTS` entries. What took hours and hours and did not seem to finish now takes less than 10 seconds. Fixes #93944 --- llvm/tools/llvm-nm/llvm-nm.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp index ff07fbbaa5351..2cc8faeae6e48 100644 --- a/llvm/tools/llvm-nm/llvm-nm.cpp +++ b/llvm/tools/llvm-nm/llvm-nm.cpp @@ -15,6 +15,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/MachO.h" @@ -1615,12 +1616,11 @@ static void dumpSymbolsFromDLInfoMachO(MachOObjectFile &MachO, } // See if these addresses are already in the symbol table. 
unsigned FunctionStartsAdded = 0; + SmallSet SymbolAddresses; + for (unsigned J = 0; J < SymbolList.size(); ++J) + SymbolAddresses.insert(SymbolList[J].Address); for (uint64_t f = 0; f < FoundFns.size(); f++) { - bool found = false; - for (unsigned J = 0; J < SymbolList.size() && !found; ++J) { - if (SymbolList[J].Address == FoundFns[f] + BaseSegmentAddress) - found = true; - } + bool found = SymbolAddresses.contains(FoundFns[f] + BaseSegmentAddress); // See this address is not already in the symbol table fake up an // nlist for it. if (!found) { From 6201874d459247701764ac294dd2ea91f40f53d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Rodr=C3=ADguez?= Date: Fri, 10 Oct 2025 11:42:43 -0700 Subject: [PATCH 2/3] Apply feedback - Inline `found` boolean. - Improve comment wording - Use foreach loop instead of classic index loop. --- llvm/tools/llvm-nm/llvm-nm.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp index 2cc8faeae6e48..968cfe1d2da57 100644 --- a/llvm/tools/llvm-nm/llvm-nm.cpp +++ b/llvm/tools/llvm-nm/llvm-nm.cpp @@ -1617,13 +1617,12 @@ static void dumpSymbolsFromDLInfoMachO(MachOObjectFile &MachO, // See if these addresses are already in the symbol table. unsigned FunctionStartsAdded = 0; SmallSet SymbolAddresses; - for (unsigned J = 0; J < SymbolList.size(); ++J) - SymbolAddresses.insert(SymbolList[J].Address); + for (const auto &S : SymbolList) + SymbolAddresses.insert(S.Address); for (uint64_t f = 0; f < FoundFns.size(); f++) { - bool found = SymbolAddresses.contains(FoundFns[f] + BaseSegmentAddress); - // See this address is not already in the symbol table fake up an - // nlist for it. - if (!found) { + // See if this address is already in the symbol table, otherwise fake up + // an nlist for it. 
+ if (!SymbolAddresses.contains(FoundFns[f] + BaseSegmentAddress)) { NMSymbol F = {}; F.Name = ""; F.Address = FoundFns[f] + BaseSegmentAddress; From 65815a4fc91d5626d38a2fd3b6990903e3237d63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Rodr=C3=ADguez?= Date: Fri, 10 Oct 2025 16:22:52 -0700 Subject: [PATCH 3/3] Add explanatory comment about relation between FoundFns and SymbolList addresses --- llvm/tools/llvm-nm/llvm-nm.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp index 968cfe1d2da57..dcfc0f92590d5 100644 --- a/llvm/tools/llvm-nm/llvm-nm.cpp +++ b/llvm/tools/llvm-nm/llvm-nm.cpp @@ -1616,6 +1616,11 @@ static void dumpSymbolsFromDLInfoMachO(MachOObjectFile &MachO, } // See if these addresses are already in the symbol table. unsigned FunctionStartsAdded = 0; + // The addresses from FoundFns come from LC_FUNCTION_STARTS. Its contents + // are delta encoded addresses from the start of __TEXT, ending when zero + // is found. Because of this, the addresses should be unique, and even if + // we create fake entries on SymbolList in the second loop, SymbolAddresses + // should not need to be updated there. SmallSet SymbolAddresses; for (const auto &S : SymbolList) SymbolAddresses.insert(S.Address);