[Support] Add decodeULEB128AndInc/decodeSLEB128AndInc #85739

MaskRay · 2024-03-19T06:18:14Z

Many decodeULEB128/decodeSLEB128 users need to increment the pointer.
Add helpers to simplify this common pattern. We don't add end and
error parameters at present because many users don't need them.

Created using spr 1.3.5-bogner

llvmbot · 2024-03-19T06:18:48Z

@llvm/pr-subscribers-llvm-support

@llvm/pr-subscribers-objectyaml

Author: Fangrui Song (MaskRay)

Changes

Many decodeULEB128/decodeSLEB128 users need to increment the pointer.
Add helpers to simplify this common pattern. We don't add end and
error parameters at present because many users don't need them.

Full diff: https://github.com/llvm/llvm-project/pull/85739.diff

4 Files Affected:

(modified) llvm/include/llvm/Support/LEB128.h (+14)
(modified) llvm/tools/obj2yaml/macho2yaml.cpp (+3-7)
(modified) llvm/unittests/Support/LEB128Test.cpp (+15)
(modified) llvm/utils/TableGen/DecoderEmitter.cpp (+10-23)

diff --git a/llvm/include/llvm/Support/LEB128.h b/llvm/include/llvm/Support/LEB128.h
index 7fc572b6ff06ef..c4e741549f3ff1 100644
--- a/llvm/include/llvm/Support/LEB128.h
+++ b/llvm/include/llvm/Support/LEB128.h
@@ -200,6 +200,20 @@ inline int64_t decodeSLEB128(const uint8_t *p, unsigned *n = nullptr,
   return Value;
 }
 
+inline uint64_t decodeULEB128AndInc(const uint8_t *&p) {
+  unsigned n;
+  auto ret = decodeULEB128(p, &n);
+  p += n;
+  return ret;
+}
+
+inline int64_t decodeSLEB128AndInc(const uint8_t *&p) {
+  unsigned n;
+  auto ret = decodeSLEB128(p, &n);
+  p += n;
+  return ret;
+}
+
 /// Utility function to get the size of the ULEB128-encoded value.
 extern unsigned getULEB128Size(uint64_t Value);
 
diff --git a/llvm/tools/obj2yaml/macho2yaml.cpp b/llvm/tools/obj2yaml/macho2yaml.cpp
index cdd871e8c1d684..d4a8c092a083f4 100644
--- a/llvm/tools/obj2yaml/macho2yaml.cpp
+++ b/llvm/tools/obj2yaml/macho2yaml.cpp
@@ -427,15 +427,13 @@ void MachODumper::dumpBindOpcodes(
         static_cast<MachO::BindOpcode>(*OpCode & MachO::BIND_OPCODE_MASK);
     BindOp.Imm = *OpCode & MachO::BIND_IMMEDIATE_MASK;
 
-    unsigned Count;
     uint64_t ULEB = 0;
     int64_t SLEB = 0;
 
     switch (BindOp.Opcode) {
     case MachO::BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB:
-      ULEB = decodeULEB128(OpCode + 1, &Count);
+      ULEB = decodeULEB128AndInc(++OpCode);
       BindOp.ULEBExtraData.push_back(ULEB);
-      OpCode += Count;
       [[fallthrough]];
     // Intentionally no break here -- this opcode has two ULEB values
 
@@ -443,15 +441,13 @@ void MachODumper::dumpBindOpcodes(
     case MachO::BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB:
     case MachO::BIND_OPCODE_ADD_ADDR_ULEB:
     case MachO::BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB:
-      ULEB = decodeULEB128(OpCode + 1, &Count);
+      ULEB = decodeULEB128AndInc(++OpCode);
       BindOp.ULEBExtraData.push_back(ULEB);
-      OpCode += Count;
       break;
 
     case MachO::BIND_OPCODE_SET_ADDEND_SLEB:
-      SLEB = decodeSLEB128(OpCode + 1, &Count);
+      SLEB = decodeSLEB128AndInc(++OpCode);
       BindOp.SLEBExtraData.push_back(SLEB);
-      OpCode += Count;
       break;
 
     case MachO::BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM:
diff --git a/llvm/unittests/Support/LEB128Test.cpp b/llvm/unittests/Support/LEB128Test.cpp
index 21523e5f7a08c7..08b8c5573ce637 100644
--- a/llvm/unittests/Support/LEB128Test.cpp
+++ b/llvm/unittests/Support/LEB128Test.cpp
@@ -242,6 +242,21 @@ TEST(LEB128Test, DecodeInvalidSLEB128) {
 #undef EXPECT_INVALID_SLEB128
 }
 
+TEST(LEB128Test, DecodeAndInc) {
+#define EXPECT_LEB128(FUN, VALUE, SIZE)                                        \
+  do {                                                                         \
+    const uint8_t *V = reinterpret_cast<const uint8_t *>(VALUE), *P = V;       \
+    auto Expected = FUN(P), Actual = FUN##AndInc(P);                           \
+    EXPECT_EQ(Actual, Expected);                                               \
+    EXPECT_EQ(P - V, SIZE);                                                    \
+  } while (0)
+  EXPECT_LEB128(decodeULEB128, "\x7f", 1);
+  EXPECT_LEB128(decodeULEB128, "\x80\x01", 2);
+  EXPECT_LEB128(decodeSLEB128, "\x7f", 1);
+  EXPECT_LEB128(decodeSLEB128, "\x80\x01", 2);
+#undef EXPECT_LEB128
+}
+
 TEST(LEB128Test, SLEB128Size) {
   // Positive Value Testing Plan:
   // (1) 128 ^ n - 1 ........ need (n+1) bytes
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 628bff520a12e9..732f34ed04c577 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -2283,10 +2283,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     }
     case MCD::OPC_CheckField: {
       // Decode the start value.
-      unsigned Len;
-      unsigned Start = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
-      Len = *Ptr;)";
+      unsigned Start = decodeULEB128AndInc(++Ptr);
+      unsigned Len = *Ptr;)";
   if (IsVarLenInst)
     OS << "\n      makeUp(insn, Start + Len);";
   OS << R"(
@@ -2311,10 +2309,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       break;
     }
     case MCD::OPC_CheckPredicate: {
-      unsigned Len;
       // Decode the Predicate Index value.
-      unsigned PIdx = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
+      unsigned PIdx = decodeULEB128AndInc(++Ptr);
       // NumToSkip is a plain 24-bit integer.
       unsigned NumToSkip = *Ptr++;
       NumToSkip |= (*Ptr++) << 8;
@@ -2330,18 +2326,15 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       break;
     }
     case MCD::OPC_Decode: {
-      unsigned Len;
       // Decode the Opcode value.
-      unsigned Opc = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
-      unsigned DecodeIdx = decodeULEB128(Ptr, &Len);
-      Ptr += Len;
+      unsigned Opc = decodeULEB128AndInc(++Ptr);
+      unsigned DecodeIdx = decodeULEB128AndInc(Ptr);
 
       MI.clear();
       MI.setOpcode(Opc);
       bool DecodeComplete;)";
   if (IsVarLenInst) {
-    OS << "\n      Len = InstrLenTable[Opc];\n"
+    OS << "\n      unsigned Len = InstrLenTable[Opc];\n"
        << "      makeUp(insn, Len);";
   }
   OS << R"(
@@ -2354,12 +2347,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       return S;
     }
     case MCD::OPC_TryDecode: {
-      unsigned Len;
       // Decode the Opcode value.
-      unsigned Opc = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
-      unsigned DecodeIdx = decodeULEB128(Ptr, &Len);
-      Ptr += Len;
+      unsigned Opc = decodeULEB128AndInc(++Ptr);
+      unsigned DecodeIdx = decodeULEB128AndInc(Ptr);
       // NumToSkip is a plain 24-bit integer.
       unsigned NumToSkip = *Ptr++;
       NumToSkip |= (*Ptr++) << 8;
@@ -2391,11 +2381,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     }
     case MCD::OPC_SoftFail: {
       // Decode the mask values.
-      unsigned Len;
-      uint64_t PositiveMask = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
-      uint64_t NegativeMask = decodeULEB128(Ptr, &Len);
-      Ptr += Len;
+      uint64_t PositiveMask = decodeULEB128AndInc(++Ptr);
+      uint64_t NegativeMask = decodeULEB128AndInc(Ptr);
       bool Fail = (insn & PositiveMask) != 0 || (~insn & NegativeMask) != 0;
       if (Fail)
         S = MCDisassembler::SoftFail;

adrian-prantl

I've been toying with the idea of moving the decoder function into the .cpp file, and maybe having two entry points for error / no errors to further speed up decoding, and this seems to get us closer to that goal.

Created using spr 1.3.5-bogner

dwblaikie · 2024-03-21T16:28:54Z

The lack of error handling and bounds checking in these versions seems a bit problematic? (like, I'm not sure they can be used safely/correctly?)

adrian-prantl · 2024-03-21T17:16:00Z

I was assuming that the code that uses these functions does a bounds safety check for the entire section. @MaskRay ?

MaskRay · 2024-03-22T02:22:01Z

The tablegen use case guarantees that there is no error. The Shift>63 check is pure overhead.
Use cases that require rigid error checking can probably use llvm/include/llvm/Support/DataExtractor.h getULEB128, but the overhead would be substantial.
At any rate, the error parameters could be added to *AndInc but I have found a good use case yet.

For efficient ULEB128 for the primary use cases (debug info, etc), perhaps we should unroll the first iteration.

Many decodeULEB128/decodeSLEB128 users need to increment the pointer. Add helpers to simplify this common pattern. We don't add `end` and `error` parameters at present because many users don't need them. Pull Request: llvm#85739

dwblaikie · 2024-03-25T17:14:05Z

The tablegen use case guarantees that there is no error. The Shift>63 check is pure overhead. Use cases that require rigid error checking can probably use llvm/include/llvm/Support/DataExtractor.h getULEB128, but the overhead would be substantial. At any rate, the error parameters could be added to *AndInc but I have found a good use case yet.

I was assuming that the code that uses these functions does a bounds safety check for the entire section. @MaskRay ?

But it's not possible to bounds check ULEB reading, right? Non-canonical ULEBs can be arbitrarily long (null padding) - so no untrusted/input buffer can be sure not to buffer overrun, I think? (looking at the implementation of decodeULEB128 - if Slice is zero, but the high bit is set, you keep reading the uleb, potentially overrunning any finite buffer)

So this is only safe to call on buffers with known contents? Which is special cases like tblgen, where the buffer is baked into the binary. Anything reading a uleb from unknown input could not use this API correctly.

I'd think it'd be best to always require error checking, despite the few cases where that's unnecessary.

Looks to me like a lot of the callers of the previous/existing/underlying API are unsafe. Tblgen's safety seems like it'd be in the minority of callers.

MaskRay · 2024-03-25T20:07:56Z

I agree that some LEB128 uses do pay less attention about potential buffer overrun.

Like the one mentioned in https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=SECURITY.txt

Compiling untrusted sources can result in arbitrary code execution and unconstrained resource consumption in the compiler. As a result, compilation of such code should be done inside a sandboxed environment to ensure that it does not compromise the host environment.

Many of LLVM tools might prioritize efficiency over bounds checking (E.g. I believe that in a lot of llvm/lib/Object/ places do not check the bounds.)
We might not have the bandwidth to address every potential LEB128 decode call site with explicit end arguments or by switching entirely to DataExtractor.

dwblaikie · 2024-03-28T18:53:08Z

I agree that some LEB128 uses do pay less attention about potential buffer overrun.

Like the one mentioned in https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=SECURITY.txt

Compiling untrusted sources can result in arbitrary code execution and unconstrained resource consumption in the compiler. As a result, compilation of such code should be done inside a sandboxed environment to ensure that it does not compromise the host environment.

Many of LLVM tools might prioritize efficiency over bounds checking (E.g. I believe that in a lot of llvm/lib/Object/ places do not check the bounds.)

I don't think that's what they're doing - my understanding from the Apple folks (& part of the reason @lhames invented the llvm::Error stuff) is that they're quite interested in having, and have invested in it quite a bit, libObject being robust to corrupted/arbitrary inputs - for better investigation into problematic object files.

If they lack bounds checking it's because they were written pretty casually to begin with - which is often the case, and unfortunate (I can say the LLVM debug info APIs are similarly pretty casual). Not out of a performance concern (performance without correctness is... not usually what anyone wants) but just because folks weren't thinking about/prioritizing bad input robustness.

We might not have the bandwidth to address every potential LEB128 decode call site with explicit end arguments or by switching entirely to DataExtractor.

I'm not suggesting we can/will/should prioritize paying down all the places we would crash on untrusted input - I think we might've done something differently if we'd thought about it from day 1 (adding fuzzers, etc) - much easier/cheaper to do on new code than to justify the cost of doing the cleanup after the fact.

But that's why I'm concerned about creating APIs that are hard to use safely. The previous/existing APIs that provide optional bounds checking are already pretty easy to misuse (they don't really push folks to being careful with these things). I think it's important to lower the cost of writing bounds safe code - and having APIs that don't provide a way to do it safely seems higher cost, because someone might reach for it & see no error handling and decide it's not worth it.

(this API feels close to gets (which, admittedly, is maximally unsafe - you can't control its input - but that's /usually/ the case with most of these uleb readings - except these sort of special cases where the input is built into the code/file, like tblgen))

adrian-prantl · 2024-03-28T22:22:42Z

have invested in it quite a bit, libObject being robust to corrupted/arbitrary inputs

That is very true. In fact we're adopting libObject more and more in LLDB and LLDB needs to be really resilient against invalid input, since it's very easy to get into a situation where it needs to parse information out of corrupted memory.

adrian-prantl · 2024-03-28T22:29:27Z

(this API feels close to gets (which, admittedly, is maximally unsafe - you can't control its input - but that's /usually/ the case with most of these uleb readings - except these sort of special cases where the input is built into the code/file, like tblgen))

This API will read at most 9 bytes out of bounds, but evidently any byte out of bounds is one too many. When I reviewed the original patch I was under the assumption that users of this API would check for the bounds in their outer parse loop, but maybe we're not saving a ton of performance by omitting the bounds check here.

dwblaikie · 2024-03-29T04:26:25Z

(this API feels close to gets (which, admittedly, is maximally unsafe - you can't control its input - but that's /usually/ the case with most of these uleb readings - except these sort of special cases where the input is built into the code/file, like tblgen))

This API will read at most 9 bytes out of bounds, but evidently any byte out of bounds is one too many. When I reviewed the original patch I was under the assumption that users of this API would check for the bounds in their outer parse loop, but maybe we're not saving a ton of performance by omitting the bounds check here.

Will it? If the ULEB is non-canonical, I thought it could read arbitrarily out of bounds? How's the 9 byte limit manifest?

adrian-prantl · 2024-03-29T17:57:27Z

llvm-project/llvm/include/llvm/Support/LEB128.h

Line 131 in a1a8bb1

inline uint64_t decodeULEB128(const uint8_t *p, unsigned *n = nullptr,

It stops if the decoded value doesn't fit into a uint64_t, which I think is 9 bytes in the worst case. Maybe 10?

EDIT: [That wasn't David's point]

adrian-prantl · 2024-03-29T17:57:49Z

If the ULEB is non-canonical,

Ah. Sorry, that is a good point!

dwblaikie · 2024-04-22T21:27:24Z

@MaskRay I think the basic request/requirement here should be that these new APIs be no worse at error handling than the ones they're wrapping: Please add the extra optional error reporting parameters.

Ideally, imo, we should be making error handling more explicit/non-optional than these APIs are. Ideally new APIs (including these wrappers) would include non-optional error handling (Error/Expected returning)

MaskRay · 2024-04-24T23:58:54Z

Sorry, just saw the discussion. Added #90006

(I believe removing default arguments isn't a lot useful as users can always use the existing decodeULEB128 with default arguments)

dwblaikie · 2024-04-30T04:33:02Z

(I believe removing default arguments isn't a lot useful as users can always use the existing decodeULEB128 with default arguments)

nod I understand. My concern is that this adds friction to users doing error checking-makes it easier to reach for an API that not only doesn't encourage it (the original API) but doesn't allow it - meaning the author has to choose between a more convenient API, and maybe doing error checking - this seems like it'd discourage error checking, which would be unfortunate.

lhames · 2024-04-30T05:26:11Z

Yeah. From an API design standpoint I'd love the error-checking variants to be the go-to ones, with something like "decodeULEB128ThatIPromiseIsSafe" as the performance option. I'd also be in favor of Expected over optional error arguments to ensure that the errors actually are checked / propagated.

Follow-up to #85739 to encourage error checking. We make `end` mandatory and add decodeULEB128AndIncUnsafe to be used without `end`. Pull Request: #90006

[𝘀𝗽𝗿] initial version

ae37a01

Created using spr 1.3.5-bogner

llvmbot added llvm:support objectyaml labels Mar 19, 2024

MaskRay requested review from JDevlieghere, adrian-prantl and dwblaikie March 19, 2024 06:19

adrian-prantl approved these changes Mar 19, 2024

View reviewed changes

don't update macho2yaml.cpp

472a2ed

Created using spr 1.3.5-bogner

MaskRay merged commit 35bf8e7 into main Mar 19, 2024
3 of 4 checks passed

MaskRay deleted the users/MaskRay/spr/support-add-decodeuleb128andincdecodesleb128andinc branch March 19, 2024 22:40

MaskRay mentioned this pull request Apr 24, 2024

[Support] Add end/error to decode[US]LEB128AndInc #90006

Merged

MaskRay added a commit that referenced this pull request May 8, 2024

[Support] Add end/error to decode[US]LEB128AndInc

efad149

Follow-up to #85739 to encourage error checking. We make `end` mandatory and add decodeULEB128AndIncUnsafe to be used without `end`. Pull Request: #90006

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Support] Add decodeULEB128AndInc/decodeSLEB128AndInc #85739

[Support] Add decodeULEB128AndInc/decodeSLEB128AndInc #85739

MaskRay commented Mar 19, 2024

llvmbot commented Mar 19, 2024 •

edited

adrian-prantl left a comment

dwblaikie commented Mar 21, 2024

adrian-prantl commented Mar 21, 2024

MaskRay commented Mar 22, 2024

dwblaikie commented Mar 25, 2024

MaskRay commented Mar 25, 2024

dwblaikie commented Mar 28, 2024

adrian-prantl commented Mar 28, 2024

adrian-prantl commented Mar 28, 2024

dwblaikie commented Mar 29, 2024

adrian-prantl commented Mar 29, 2024 •

edited

adrian-prantl commented Mar 29, 2024

dwblaikie commented Apr 22, 2024

MaskRay commented Apr 24, 2024

dwblaikie commented Apr 30, 2024

lhames commented Apr 30, 2024

[Support] Add decodeULEB128AndInc/decodeSLEB128AndInc #85739

[Support] Add decodeULEB128AndInc/decodeSLEB128AndInc #85739

Conversation

MaskRay commented Mar 19, 2024

llvmbot commented Mar 19, 2024 • edited

adrian-prantl left a comment

Choose a reason for hiding this comment

dwblaikie commented Mar 21, 2024

adrian-prantl commented Mar 21, 2024

MaskRay commented Mar 22, 2024

dwblaikie commented Mar 25, 2024

MaskRay commented Mar 25, 2024

dwblaikie commented Mar 28, 2024

adrian-prantl commented Mar 28, 2024

adrian-prantl commented Mar 28, 2024

dwblaikie commented Mar 29, 2024

adrian-prantl commented Mar 29, 2024 • edited

adrian-prantl commented Mar 29, 2024

dwblaikie commented Apr 22, 2024

MaskRay commented Apr 24, 2024

dwblaikie commented Apr 30, 2024

lhames commented Apr 30, 2024

llvmbot commented Mar 19, 2024 •

edited

adrian-prantl commented Mar 29, 2024 •

edited