diff --git a/llvm/include/llvm/DebugInfo/GSYM/GsymReader.h b/llvm/include/llvm/DebugInfo/GSYM/GsymReader.h index cd4fdfa0e9e37..c1bdc68d808c7 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/GsymReader.h +++ b/llvm/include/llvm/DebugInfo/GSYM/GsymReader.h @@ -106,6 +106,15 @@ class GsymReader { /// address. llvm::Expected getFunctionInfo(uint64_t Addr) const; + /// Get the full function info given an address index. + /// + /// \param AddrIdx A address index for an address in the address table. + /// + /// \returns An expected FunctionInfo that contains the function info object + /// or an error object that indicates reason for failing get the function + /// info object. + llvm::Expected getFunctionInfoAtIndex(uint64_t AddrIdx) const; + /// Lookup an address in the a GSYM. /// /// Lookup just the information needed for a specific address \a Addr. This @@ -266,6 +275,19 @@ class GsymReader { return std::nullopt; if (Iter == End || AddrOffset < *Iter) --Iter; + + // GSYM files have sorted function infos with the most information (line + // table and/or inline info) first in the array of function infos, so + // always backup as much as possible as long as the address offset is the + // same as the previous entry. + while (Iter != Begin) { + auto Prev = Iter - 1; + if (*Prev == *Iter) + Iter = Prev; + else + break; + } + return std::distance(Begin, Iter); } @@ -303,6 +325,38 @@ class GsymReader { /// \returns An optional GSYM data offset for the offset of the FunctionInfo /// that needs to be decoded. std::optional getAddressInfoOffset(size_t Index) const; + + /// Given an address, find the correct function info data and function + /// address. + /// + /// Binary search the address table and find the matching address info + /// and make sure that the function info contains the address. GSYM allows + /// functions to overlap, and the most debug info is contained in the first + /// entries due to the sorting when GSYM files are created. We can have + /// multiple function info that start at the same address only if their + /// address range doesn't match. So find the first entry that matches \a Addr + /// and iterate forward until we find one that contains the address. + /// + /// \param[in] Addr A virtual address that matches the original object file + /// to lookup. + /// + /// \param[out] FuncStartAddr A virtual address that is the base address of + /// the function that is used for decoding the FunctionInfo. + /// + /// \returns An valid data extractor on success, or an error if we fail to + /// find the address in a function info or corrrectly decode the data + llvm::Expected + getFunctionInfoDataForAddress(uint64_t Addr, uint64_t &FuncStartAddr) const; + + /// Get the function data and address given an address index. + /// + /// \param AddrIdx A address index from the address table. + /// + /// \returns An expected FunctionInfo that contains the function info object + /// or an error object that indicates reason for failing to lookup the + /// address. + llvm::Expected + getFunctionInfoDataAtIndex(uint64_t AddrIdx, uint64_t &FuncStartAddr) const; }; } // namespace gsym diff --git a/llvm/lib/DebugInfo/GSYM/GsymReader.cpp b/llvm/lib/DebugInfo/GSYM/GsymReader.cpp index 1fe90ef579a3d..4b1b352466175 100644 --- a/llvm/lib/DebugInfo/GSYM/GsymReader.cpp +++ b/llvm/lib/DebugInfo/GSYM/GsymReader.cpp @@ -253,49 +253,94 @@ GsymReader::getAddressIndex(const uint64_t Addr) const { } -llvm::Expected GsymReader::getFunctionInfo(uint64_t Addr) const { - Expected AddressIndex = getAddressIndex(Addr); - if (!AddressIndex) - return AddressIndex.takeError(); - // Address info offsets size should have been checked in parse(). - assert(*AddressIndex < AddrInfoOffsets.size()); - auto AddrInfoOffset = AddrInfoOffsets[*AddressIndex]; - assert( - (Endian == llvm::endianness::big || Endian == llvm::endianness::little) && - "Endian must be either big or little"); - DataExtractor Data(MemBuffer->getBuffer().substr(AddrInfoOffset), - Endian == llvm::endianness::little, 4); - if (std::optional OptAddr = getAddress(*AddressIndex)) { - auto ExpectedFI = FunctionInfo::decode(Data, *OptAddr); - if (ExpectedFI) { - if (ExpectedFI->Range.contains(Addr) || ExpectedFI->Range.size() == 0) - return ExpectedFI; - return createStringError(std::errc::invalid_argument, - "address 0x%" PRIx64 " is not in GSYM", Addr); +llvm::Expected +GsymReader::getFunctionInfoDataForAddress(uint64_t Addr, + uint64_t &FuncStartAddr) const { + Expected ExpectedAddrIdx = getAddressIndex(Addr); + if (!ExpectedAddrIdx) + return ExpectedAddrIdx.takeError(); + const uint64_t FirstAddrIdx = *ExpectedAddrIdx; + // The AddrIdx is the first index of the function info entries that match + // \a Addr. We need to iterate over all function info objects that start with + // the same address until we find a range that contains \a Addr. + std::optional FirstFuncStartAddr; + const size_t NumAddresses = getNumAddresses(); + for (uint64_t AddrIdx = FirstAddrIdx; AddrIdx < NumAddresses; ++AddrIdx) { + auto ExpextedData = getFunctionInfoDataAtIndex(AddrIdx, FuncStartAddr); + // If there was an error, return the error. + if (!ExpextedData) + return ExpextedData; + + // Remember the first function start address if it hasn't already been set. + // If it is already valid, check to see if it matches the first function + // start address and only continue if it matches. + if (FirstFuncStartAddr.has_value()) { + if (*FirstFuncStartAddr != FuncStartAddr) + break; // Done with consecutive function entries with same address. + } else { + FirstFuncStartAddr = FuncStartAddr; } + // Make sure the current function address ranges contains \a Addr. + // Some symbols on Darwin don't have valid sizes, so if we run into a + // symbol with zero size, then we have found a match for our address. + + // The first thing the encoding of a FunctionInfo object is the function + // size. + uint64_t Offset = 0; + uint32_t FuncSize = ExpextedData->getU32(&Offset); + if (FuncSize == 0 || + AddressRange(FuncStartAddr, FuncStartAddr + FuncSize).contains(Addr)) + return ExpextedData; } return createStringError(std::errc::invalid_argument, - "failed to extract address[%" PRIu64 "]", - *AddressIndex); + "address 0x%" PRIx64 " is not in GSYM", Addr); +} + +llvm::Expected +GsymReader::getFunctionInfoDataAtIndex(uint64_t AddrIdx, + uint64_t &FuncStartAddr) const { + if (AddrIdx >= getNumAddresses()) + return createStringError(std::errc::invalid_argument, + "invalid address index %" PRIu64, AddrIdx); + const uint32_t AddrInfoOffset = AddrInfoOffsets[AddrIdx]; + assert((Endian == endianness::big || Endian == endianness::little) && + "Endian must be either big or little"); + StringRef Bytes = MemBuffer->getBuffer().substr(AddrInfoOffset); + if (Bytes.empty()) + return createStringError(std::errc::invalid_argument, + "invalid address info offset 0x%" PRIx32, + AddrInfoOffset); + std::optional OptFuncStartAddr = getAddress(AddrIdx); + if (!OptFuncStartAddr) + return createStringError(std::errc::invalid_argument, + "failed to extract address[%" PRIu64 "]", AddrIdx); + FuncStartAddr = *OptFuncStartAddr; + return DataExtractor(Bytes, Endian == llvm::endianness::little, 4); +} + +llvm::Expected GsymReader::getFunctionInfo(uint64_t Addr) const { + uint64_t FuncStartAddr = 0; + if (auto ExpectedData = getFunctionInfoDataForAddress(Addr, FuncStartAddr)) + return FunctionInfo::decode(*ExpectedData, FuncStartAddr); + else + return ExpectedData.takeError(); +} + +llvm::Expected +GsymReader::getFunctionInfoAtIndex(uint64_t Idx) const { + uint64_t FuncStartAddr = 0; + if (auto ExpectedData = getFunctionInfoDataAtIndex(Idx, FuncStartAddr)) + return FunctionInfo::decode(*ExpectedData, FuncStartAddr); + else + return ExpectedData.takeError(); } llvm::Expected GsymReader::lookup(uint64_t Addr) const { - Expected AddressIndex = getAddressIndex(Addr); - if (!AddressIndex) - return AddressIndex.takeError(); - // Address info offsets size should have been checked in parse(). - assert(*AddressIndex < AddrInfoOffsets.size()); - auto AddrInfoOffset = AddrInfoOffsets[*AddressIndex]; - assert( - (Endian == llvm::endianness::big || Endian == llvm::endianness::little) && - "Endian must be either big or little"); - DataExtractor Data(MemBuffer->getBuffer().substr(AddrInfoOffset), - Endian == llvm::endianness::little, 4); - if (std::optional OptAddr = getAddress(*AddressIndex)) - return FunctionInfo::lookup(Data, *this, *OptAddr, Addr); - return createStringError(std::errc::invalid_argument, - "failed to extract address[%" PRIu64 "]", - *AddressIndex); + uint64_t FuncStartAddr = 0; + if (auto ExpectedData = getFunctionInfoDataForAddress(Addr, FuncStartAddr)) + return FunctionInfo::lookup(*ExpectedData, *this, FuncStartAddr, Addr); + else + return ExpectedData.takeError(); } void GsymReader::dump(raw_ostream &OS) { @@ -346,7 +391,7 @@ void GsymReader::dump(raw_ostream &OS) { for (uint32_t I = 0; I < Header.NumAddresses; ++I) { OS << "FunctionInfo @ " << HEX32(AddrInfoOffsets[I]) << ": "; - if (auto FI = getFunctionInfo(*getAddress(I))) + if (auto FI = getFunctionInfoAtIndex(I)) dump(OS, *FI); else logAllUnhandledErrors(FI.takeError(), OS, "FunctionInfo:"); diff --git a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp index 53de96cc6953c..f7ee7693ac2ee 100644 --- a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp +++ b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp @@ -4681,3 +4681,200 @@ TEST(GSYMTest, TestHandlingOfInvalidFileIndexes) { for (const auto &Error : ExpectedLogErrors) EXPECT_TRUE(errors.find(Error) != std::string::npos); } + +TEST(GSYMTest, TestLookupsOfOverlappingAndUnequalRanges) { + // Test that llvm-gsymutil lookup the correct funtion info when address + // ranges overlap. When functions overlap we always want to pick the first + // function info when symbolicating if there are multiple entries with the + // same address. Previous to this fix we would just binary search the address + // table and pick the first function info that matched the address. After + // this fix we now always select the first matching entry whose address range + // contains the lookup address to ensure we have the most debug info. We have + // seen case where the debug info would contain a small range and a symbol + // would have the same start address but the range was larger and sometimes, + // depending on how the binary search of the address table happened, we would + // pick these latter entries. We want the first entries because they always + // have the most debug info. + // + // To repro this case, we just make some simple DWARF that has two + // overlapping ranges and ensure that any lookups between 0x1000 and 0x104f + // match "foo", and any ranges between 0x1050 and 0x1fff match "bar". + // + // 0x0000000b: DW_TAG_compile_unit + // DW_AT_name ("/tmp/main.cpp") + // DW_AT_language (DW_LANG_C) + // DW_AT_stmt_list (0x00000000) + // + // 0x00000015: DW_TAG_subprogram + // DW_AT_name ("foo") + // DW_AT_low_pc (0x0000000000001000) + // DW_AT_high_pc (0x0000000000001050) + // + // 0x0000002a: DW_TAG_subprogram + // DW_AT_name ("bar") + // DW_AT_low_pc (0x0000000000001000) + // DW_AT_high_pc (0x0000000000001100) + // + // 0x0000003f: NULL + + StringRef yamldata = R"( + debug_str: + - '' + - '/tmp/main.cpp' + - foo + - bar + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_language + Form: DW_FORM_udata + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + debug_info: + - Length: 0x3C + Version: 4 + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x1 + - Value: 0x2 + - Value: 0x0 + - AbbrCode: 0x2 + Values: + - Value: 0xF + - Value: 0x1000 + - Value: 0x1050 + - AbbrCode: 0x2 + Values: + - Value: 0x13 + - Value: 0x1000 + - Value: 0x1100 + - AbbrCode: 0x0 + debug_line: + - Length: 71 + Version: 2 + PrologueLength: 36 + MinInstLength: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] + IncludeDirs: + - '/tmp' + Files: + - Name: main.cpp + DirIdx: 1 + ModTime: 0 + Length: 0 + Opcodes: + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 4096 + - Opcode: DW_LNS_advance_line + SData: 9 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_advance_pc + Data: 16 + - Opcode: DW_LNS_advance_line + SData: 1 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_advance_line + SData: 1 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_advance_pc + Data: 64 + - Opcode: DW_LNS_advance_line + SData: 1 + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + )"; + auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); + ASSERT_THAT_EXPECTED(ErrOrSections, Succeeded()); + std::unique_ptr DwarfContext = + DWARFContext::create(*ErrOrSections, 8); + ASSERT_TRUE(DwarfContext.get() != nullptr); + std::string errors; + raw_string_ostream OS(errors); + GsymCreator GC; + DwarfTransformer DT(*DwarfContext, GC); + const uint32_t ThreadCount = 1; + ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded()); + ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded()); + OS.flush(); + SmallString<512> Str; + raw_svector_ostream OutStrm(Str); + const auto ByteOrder = llvm::endianness::native; + FileWriter FW(OutStrm, ByteOrder); + ASSERT_THAT_ERROR(GC.encode(FW), Succeeded()); + Expected GR = GsymReader::copyBuffer(OutStrm.str()); + ASSERT_THAT_EXPECTED(GR, Succeeded()); + // There should be two functions in our GSYM. + EXPECT_EQ(GR->getNumAddresses(), 2u); + // Verify "foo" is correctly looked up for each of its addresses. + for (uint64_t Addr = 0x1000; Addr < 0x1050; ++Addr) { + auto ExpFI = GR->getFunctionInfo(Addr); + ASSERT_THAT_EXPECTED(ExpFI, Succeeded()); + ASSERT_EQ(ExpFI->Range, AddressRange(0x1000, 0x1050)); + StringRef FuncName = GR->getString(ExpFI->Name); + EXPECT_EQ(FuncName, "foo"); + } + + // Verify "bar" is correctly looked up for each of its addresses. + for (uint64_t Addr = 0x1050; Addr < 0x1100; ++Addr) { + auto ExpFI = GR->getFunctionInfo(Addr); + ASSERT_THAT_EXPECTED(ExpFI, Succeeded()); + ASSERT_EQ(ExpFI->Range, AddressRange(0x1000, 0x1100)); + StringRef FuncName = GR->getString(ExpFI->Name); + EXPECT_EQ(FuncName, "bar"); + } + + // Prior to the fix for this issue when we dumped an entire GSYM file, we + // were using a function that would extract a FunctionInfo object for a + // given address which caused us to always dump the first FunctionInfo + // entry for a given address. We now dump it correctly using an address + // index. Below we verify that we dump the right FunctionInfo gets dumped. + + SmallString<512> DumpStr; + raw_svector_ostream DumpStrm(DumpStr); + GR->dump(DumpStrm); + + // Make sure we see both "foo" and "bar" in the output of an entire GSYM + // dump. Prior to this fix we would two "foo" entries. + std::vector ExpectedDumpLines = { + "@ 0x00000068: [0x0000000000001000 - 0x0000000000001050) \"foo\"", + "@ 0x00000088: [0x0000000000001000 - 0x0000000000001100) \"bar\""}; + // Make sure all expected errors are in the error stream for the two invalid + // inlined functions that we removed due to invalid range scoping. + for (const auto &Line : ExpectedDumpLines) + EXPECT_TRUE(DumpStr.find(Line) != std::string::npos); +}