diff --git a/llvm/include/llvm/Support/LEB128.h b/llvm/include/llvm/Support/LEB128.h
index 4e2262fb15c56..6cea80eb74bcf 100644
--- a/llvm/include/llvm/Support/LEB128.h
+++ b/llvm/include/llvm/Support/LEB128.h
@@ -252,6 +252,47 @@ LLVM_ABI extern unsigned getULEB128Size(uint64_t Value);
 /// Utility function to get the size of the SLEB128-encoded value.
 LLVM_ABI extern unsigned getSLEB128Size(int64_t Value);
 
+// Unsigned Counted LEB128: A variant of LEB128 where the length information is
+// determined by counting trailing zero bits in the first byte. Specifically, if
+// the first byte has n-1 trailing zeros, then the encoded integer occupies n
+// bytes total. The special case of a zero first byte signals a 9-byte encoding.
+//
+// The remaining bits in the first byte, plus all subsequent bytes, contain the
+// actual value in little-endian order.
+
+// clang-format off
+// xxxxxxx1: 7 value bits, 1 byte
+// xxxxxx10 xxxxxxxx: 14 value bits, 2 bytes
+// xxxxx100 xxxxxxxx xxxxxxxx: 21 value bits, 3 bytes
+// xxxx1000 xxxxxxxx xxxxxxxx xxxxxxxx: 28 value bits, 4 bytes
+// xxx10000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 35 value bits, 5 bytes
+// xx100000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 42 value bits, 6 bytes
+// x1000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 49 value bits, 7 bytes
+// 10000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 56 value bits, 8 bytes
+//
+// 00000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 64 value bits, 9 bytes
+// The encoder always emits the shortest form, so the last byte of a 9-byte
+// encoding it produces is never 0. The decoders do not enforce this and also
+// accept longer-than-necessary encodings.
+// clang-format on
+
+/// Append the UCLEB128 encoding of \p x (1 to 9 bytes) to \p os.
+LLVM_ABI void encodeUCLeb128(uint64_t x, raw_ostream &os);
+
+/// Decode a UCLEB128 value from [\p p, \p end) and advance \p p past it.
+/// If the range is empty or shorter than the encoding's length, returns 0 and
+/// leaves \p p unchanged. A successful decode always advances \p p, which
+/// distinguishes a decoded value of 0 from a truncated input.
+LLVM_ABI uint64_t getUCLeb128(const uint8_t *&p, const uint8_t *end);
+
+/// Decode a UCLEB128 value at \p p and advance \p p past it, without any
+/// bounds checking. The caller must guarantee that the complete encoding (up
+/// to 9 bytes) is readable.
+LLVM_ABI uint64_t getUCLeb128Unsafe(const uint8_t *&p);
+
+// Note: If we introduce signed version of CLEB128, we should use sign extension
+// instead of zig-zag encoding. Sign extension actually generates faster code.
+
 } // namespace llvm
 
 #endif // LLVM_SUPPORT_LEB128_H
diff --git a/llvm/lib/Support/LEB128.cpp b/llvm/lib/Support/LEB128.cpp
index d41b673e9c8a5..703d13c46605c 100644
--- a/llvm/lib/Support/LEB128.cpp
+++ b/llvm/lib/Support/LEB128.cpp
@@ -12,6 +12,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/LEB128.h"
+#include "llvm/ADT/bit.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::support;
 
 namespace llvm {
 
@@ -39,5 +46,100 @@ unsigned getSLEB128Size(int64_t Value) {
   } while (IsMore);
   return Size;
 }
-
 } // namespace llvm
+
+/// Write \p x to \p os in UCLEB128 form using the fewest bytes that fit.
+void llvm::encodeUCLeb128(uint64_t x, raw_ostream &os) {
+  // Fast path for n == 1
+  if (x < 128) {
+    os.write((x << 1) | 1);
+    return;
+  }
+
+  // Each encoded byte contributes 7 value bits (the eighth pays for the tag).
+  unsigned significantBits = 64 - countl_zero(x);
+  unsigned n = (significantBits + 6) / 7;
+  if (n > 8) {
+    // 9 bytes: 00000000 xxxxxxxx ...
+    os.write(0);
+    endian::write(os, x, endianness::little);
+    return;
+  }
+
+  // n <= 8 implies x < 2^56, so shifting in the n-bit tag cannot overflow.
+  // Convert to little-endian so that the first n bytes in memory are the
+  // encoding on any host.
+  uint64_t tagged = endian::byte_swap((x << n) | (uint64_t(1) << (n - 1)),
+                                      endianness::little);
+  os.write(reinterpret_cast<const char *>(&tagged), n);
+}
+
+/// Decode an n-byte encoding (2 <= n <= 8) whose first byte \p byte sits at
+/// \p p, then advance \p p by n.
+template <int n>
+static inline uint64_t getUCLeb128Case(const uint8_t *&p, uint8_t byte) {
+  // The first byte keeps 8 - n value bits above its n-bit tag.
+  uint64_t val = byte >> n;
+  int shift = 8 - n;
+  for (int i = 1; i < n; ++i) {
+    val |= uint64_t(p[i]) << shift;
+    shift += 8;
+  }
+  p += n;
+  return val;
+}
+
+/// Shared decoder. With \p CheckBounds set, returns 0 and leaves \p p alone
+/// when [\p p, \p end) cannot hold the whole encoding; otherwise \p end is
+/// ignored.
+template <bool CheckBounds>
+static uint64_t getUCLeb128Impl(const uint8_t *&p, const uint8_t *end) {
+  if constexpr (CheckBounds) {
+    if (p >= end)
+      return 0;
+  }
+  // Fast path for n == 1
+  uint8_t b0 = p[0];
+  if (b0 & 1) {
+    ++p;
+    return b0 >> 1;
+  }
+
+  // b0 is even here, so n >= 2; a zero byte ends up in the 9-byte default case.
+  unsigned n = llvm::countr_zero(b0) + 1;
+  if constexpr (CheckBounds) {
+    // p < end was established above, so the difference is positive.
+    if (static_cast<size_t>(end - p) < n)
+      return 0;
+  }
+  // Note: If n < 9 and we allow out-of-bounds read, we can use read64le(p) <<
+  // (64-8*n) >> (64-7*n) instead of the following switch statement.
+  switch (n) {
+  case 2:
+    return getUCLeb128Case<2>(p, b0);
+  case 3:
+    return getUCLeb128Case<3>(p, b0);
+  case 4:
+    return getUCLeb128Case<4>(p, b0);
+  case 5:
+    return getUCLeb128Case<5>(p, b0);
+  case 6:
+    return getUCLeb128Case<6>(p, b0);
+  case 7:
+    return getUCLeb128Case<7>(p, b0);
+  case 8:
+    return getUCLeb128Case<8>(p, b0);
+  default:
+    // 9 bytes: 00000000 xxxxxxxx ...
+    p += 9;
+    return endian::read64le(p - 8);
+  }
+}
+
+uint64_t llvm::getUCLeb128(const uint8_t *&p, const uint8_t *end) {
+  return getUCLeb128Impl<true>(p, end);
+}
+
+uint64_t llvm::getUCLeb128Unsafe(const uint8_t *&p) {
+  return getUCLeb128Impl<false>(p, nullptr);
+}
diff --git a/llvm/unittests/Support/LEB128Test.cpp b/llvm/unittests/Support/LEB128Test.cpp
index 0c54a2846903b..668fb48278998 100644
--- a/llvm/unittests/Support/LEB128Test.cpp
+++ b/llvm/unittests/Support/LEB128Test.cpp
@@ -474,4 +474,97 @@ TEST(LEB128Test, ULEB128Size) {
   EXPECT_EQ(10u, getULEB128Size(UINT64_MAX));
 }
 
+TEST(CLeb128Test, get) {
+#define EXPECT_CLEB128(VALUE, EXPECTED, SIZE)                                  \
+  do {                                                                         \
+    const uint8_t *V = reinterpret_cast<const uint8_t *>(VALUE);               \
+    const uint8_t *P = V;                                                      \
+    const uint8_t *End = V + sizeof(VALUE) - 1;                                \
+    uint64_t Result = getUCLeb128(P, End);                                     \
+    EXPECT_EQ(Result, EXPECTED);                                               \
+    EXPECT_EQ(P - V, SIZE);                                                    \
+    P = V;                                                                     \
+    Result = getUCLeb128Unsafe(P);                                             \
+    EXPECT_EQ(Result, EXPECTED);                                               \
+    EXPECT_EQ(P - V, SIZE);                                                    \
+  } while (0)
+
+  // Fast path: single byte with LSB = 1 (value = byte >> 1)
+  EXPECT_CLEB128("\x01", 0u, 1);
+  EXPECT_CLEB128("\x7f", 63u, 1);
+  EXPECT_CLEB128("\xff", 127u, 1);
+  EXPECT_CLEB128("\x02\x02", 128u, 2);
+  EXPECT_CLEB128("\x00\x00\x01\x00\x00\x00\x00\x00\x00", 256u, 9);
+
+  // Test (1<<56)-2
+  EXPECT_CLEB128("\x80\xfe\xff\xff\xff\xff\xff\xff", 0xfffffffffffffeu, 8);
+  EXPECT_CLEB128("\x00\xfe\xff\xff\xff\xff\xff\xff\x00", 0xfffffffffffffeu, 9);
+
+#undef EXPECT_CLEB128
+
+  // Test bounds checking in safe version
+  {
+    const uint8_t data[] = {0x02, 0x02}; // 2-byte encoding for 128
+    const uint8_t *p = data;
+
+    // Insufficient buffer (should return 0)
+    p = data;
+    EXPECT_EQ(getUCLeb128(p, data + 1), 0u);
+    EXPECT_EQ(p, data);
+
+    // Empty buffer
+    p = data;
+    EXPECT_EQ(getUCLeb128(p, data), 0u);
+    EXPECT_EQ(p, data);
+  }
+
+  // Test 9-byte format bounds checking
+  {
+    const uint8_t data[] = {0x00, 0x01, 0x02, 0x03, 0x04,
+                            0x05, 0x06, 0x07, 0x08, 0x09};
+    const uint8_t *p = data;
+
+    // Sufficient buffer for 9-byte format
+    EXPECT_EQ(getUCLeb128(p, data + 10), 0x0807060504030201ULL);
+    EXPECT_EQ(p, data + 9);
+
+    // Exactly 9 bytes available
+    p = data;
+    EXPECT_EQ(getUCLeb128(p, data + 9), 0x0807060504030201ULL);
+    EXPECT_EQ(p, data + 9);
+
+    // Insufficient buffer for 9-byte format
+    p = data;
+    EXPECT_EQ(getUCLeb128(p, data + 8), 0u);
+    EXPECT_EQ(p, data);
+  }
+}
+
+TEST(CLeb128Test, encode) {
+  // Test round-trip consistency for all encoding lengths.
+  const uint64_t vals[] = {
+      0,                // 1 byte
+      128,              // 2 bytes
+      (1ULL << 14) + 2, // 3 bytes
+      (1ULL << 21) + 3, // 4 bytes
+      (1ULL << 28) + 4, // 5 bytes
+      (1ULL << 35) + 5, // 6 bytes
+      (1ULL << 42) + 6, // 7 bytes
+      (1ULL << 49) + 7, // 8 bytes
+      UINT64_MAX / 2,   // 9 bytes
+      UINT64_MAX - 1,   // 9 bytes
+  };
+  for (uint64_t val : vals) {
+    std::string encoded;
+    raw_string_ostream os(encoded);
+    encodeUCLeb128(val, os);
+
+    const uint8_t *p0 = reinterpret_cast<const uint8_t *>(encoded.data());
+    const uint8_t *p = p0;
+    uint64_t decoded = getUCLeb128Unsafe(p);
+    EXPECT_EQ(val, decoded) << "Round-trip failed for value " << val;
+    EXPECT_EQ(static_cast<size_t>(p - p0), encoded.size());
+  }
+}
+
 } // anonymous namespace