Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions llvm/include/llvm/Support/LEB128.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,34 @@ LLVM_ABI extern unsigned getULEB128Size(uint64_t Value);
/// Utility function to get the size of the SLEB128-encoded value.
LLVM_ABI extern unsigned getSLEB128Size(int64_t Value);

// Unsigned Counted LEB128: A variant of LEB128 where the length information is
// determined by counting trailing zero bits in the first byte. Specifically, if
// the first byte has n-1 trailing zeros, then the encoded integer occupies n
// bytes total. The special case of a zero first byte signals a 9-byte encoding.
//
// The remaining bits in the first byte, plus all subsequent bytes, contain the
// actual value in little-endian order.

// clang-format off
// xxxxxxx1: 7 value bits, 1 byte
// xxxxxx10 xxxxxxxx: 14 value bits, 2 bytes
// xxxxx100 xxxxxxxx xxxxxxxx: 21 value bits, 3 bytes
// xxxx1000 xxxxxxxx xxxxxxxx xxxxxxxx: 28 value bits, 4 bytes
// xxx10000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 35 value bits, 5 bytes
// xx100000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 42 value bits, 6 bytes
// x1000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 49 value bits, 7 bytes
// 10000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 56 value bits, 8 bytes
//
// 00000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 64 value bits, 9 bytes
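// In the 9-byte form the last byte should not be 0: a zero there means the
// value fits in 56 bits, which the 8-byte form already covers.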
// clang-format on
/// Encode \p x as Unsigned Counted LEB128 and write the resulting bytes
/// (between 1 and 9 of them) to \p os.
LLVM_ABI void encodeUCLeb128(uint64_t x, raw_ostream &os);
/// Bounds-checked decode of one Unsigned Counted LEB128 value starting at
/// \p p, where \p end is one past the last readable byte.
///
/// On success \p p is advanced past the encoding and the decoded value is
/// returned. If the range [p, end) is empty, or shorter than the length
/// announced by the first byte, 0 is returned and \p p is left unchanged.
LLVM_ABI uint64_t getUCLeb128(const uint8_t *&p, const uint8_t *end);
/// Decode one Unsigned Counted LEB128 value starting at \p p without any bounds
/// checking and advance \p p past it. The caller must guarantee that the whole
/// encoding (up to 9 bytes) is readable.
LLVM_ABI uint64_t getUCLeb128Unsafe(const uint8_t *&p);

// Note: If we introduce a signed version of CLEB128, we should use sign
// extension instead of zig-zag encoding. Sign extension actually generates
// faster code.

} // namespace llvm

#endif // LLVM_SUPPORT_LEB128_H
93 changes: 92 additions & 1 deletion llvm/lib/Support/LEB128.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@
//===----------------------------------------------------------------------===//

#include "llvm/Support/LEB128.h"
#include "llvm/ADT/bit.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::support;

namespace llvm {

Expand Down Expand Up @@ -39,5 +46,89 @@ unsigned getSLEB128Size(int64_t Value) {
} while (IsMore);
return Size;
}

} // namespace llvm

/// Write \p x to \p os in Unsigned Counted LEB128 form.
///
/// The encoding occupies between 1 and 9 bytes. For lengths 1 through 8 the
/// value is shifted left by the length n and a single tag bit is set at bit
/// position n - 1, so the first byte ends in n - 1 zero bits. Values that need
/// more than 8 seven-bit groups are written as a zero byte followed by the raw
/// 64-bit value in little-endian order.
void llvm::encodeUCLeb128(uint64_t x, raw_ostream &os) {
  // Fast path for n == 1: seven value bits above a set low tag bit.
  if (x < 128) {
    os.write(static_cast<unsigned char>((x << 1) | 1));
    return;
  }

  // n is the number of 7-bit groups needed to hold x. x is non-zero here, so
  // the leading-zero count is well below 64.
  unsigned significantBits = 64 - countl_zero(x);
  unsigned n = (significantBits + 6) / 7;
  if (n > 8) {
    // 9 bytes: 00000000 xxxxxxxx ...
    os.write(0);
    endian::write(os, x, endianness::little);
    return;
  }

  // Place the value above the n-bit tag and lay the result out little-endian,
  // so that the first n bytes in memory are exactly the encoding.
  // Review nit: a ulittle64_t could hold the tagged value and make the explicit
  // byte_swap unnecessary; the bytes of interest would still start at offset
  // zero.
  uint64_t tagged = endian::byte_swap(
      (x << n) | (static_cast<uint64_t>(1) << (n - 1)), endianness::little);
  os.write(reinterpret_cast<const char *>(&tagged), n);
}

/// Decode a counted-LEB128 value whose total length is already known to be
/// \p n bytes.
///
/// \p byte is the first byte of the encoding (equal to p[0]); its low \p n
/// bits are the length tag and are discarded. The remaining n - 1 bytes are
/// read from p[1] onwards in little-endian order, and \p p is advanced by
/// \p n.
template <int n>
static inline uint64_t getUCLeb128Case(const uint8_t *&p, uint8_t byte) {
  // Payload bits carried by the first byte, after dropping the tag.
  uint64_t result = byte >> n;
  // The first byte supplies 8 - n payload bits; each later byte adds 8 more.
  for (int idx = 1, bitPos = 8 - n; idx < n; ++idx, bitPos += 8)
    result |= static_cast<uint64_t>(p[idx]) << bitPos;
  p += n;
  return result;
}

/// Shared decoder behind getUCLeb128 and getUCLeb128Unsafe.
///
/// When \p CheckBounds is true, \p end is one past the last readable byte; an
/// empty range, or one shorter than the length announced by the first byte,
/// yields 0 with \p p left untouched. When \p CheckBounds is false, \p end is
/// never examined. On success \p p is advanced past the encoding.
template <bool CheckBounds>
static uint64_t getUCLeb128Impl(const uint8_t *&p, const uint8_t *end) {
  if constexpr (CheckBounds) {
    if (p >= end)
      return 0;
  }

  const uint8_t first = *p;
  // Fast path for n == 1: a set low bit marks a single-byte encoding.
  if ((first & 1) != 0) {
    p += 1;
    return first >> 1;
  }

  // Total length is one more than the number of trailing zero bits in the
  // first byte. NOTE(review): the 9-byte case relies on countr_zero returning
  // 8 for an all-zero uint8_t - confirm against llvm/ADT/bit.h.
  const unsigned len = llvm::countr_zero(first) + 1;
  if constexpr (CheckBounds) {
    if (end - p < len)
      return 0;
  }

  // Note: If len < 9 and we allow out-of-bounds read, we can use
  // read64le(p) << (64-8*len) >> (64-7*len) instead of the following switch
  // statement.
  switch (len) {
  case 1:
    return getUCLeb128Case<1>(p, first);
  case 2:
    return getUCLeb128Case<2>(p, first);
  case 3:
    return getUCLeb128Case<3>(p, first);
  case 4:
    return getUCLeb128Case<4>(p, first);
  case 5:
    return getUCLeb128Case<5>(p, first);
  case 6:
    return getUCLeb128Case<6>(p, first);
  case 7:
    return getUCLeb128Case<7>(p, first);
  case 8:
    return getUCLeb128Case<8>(p, first);
  default:
    break;
  }

  // 9 bytes: 00000000 xxxxxxxx ... - the eight bytes after the zero tag byte
  // hold the value verbatim in little-endian order.
  const uint64_t value = endian::read64le(p + 1);
  p += 9;
  return value;
}

/// Bounds-checked entry point: decodes one value from [p, end) via the shared
/// implementation with bounds checking enabled.
uint64_t llvm::getUCLeb128(const uint8_t *&p, const uint8_t *end) {
  constexpr bool CheckBounds = true;
  return getUCLeb128Impl<CheckBounds>(p, end);
}

/// Unchecked entry point: decodes one value at \p p via the shared
/// implementation with bounds checking disabled, so no end pointer is needed.
uint64_t llvm::getUCLeb128Unsafe(const uint8_t *&p) {
  constexpr bool CheckBounds = false;
  const uint8_t *const NoEnd = nullptr;
  return getUCLeb128Impl<CheckBounds>(p, NoEnd);
}
86 changes: 86 additions & 0 deletions llvm/unittests/Support/LEB128Test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -474,4 +474,90 @@ TEST(LEB128Test, ULEB128Size) {
EXPECT_EQ(10u, getULEB128Size(UINT64_MAX));
}

// Decodes hand-written byte strings with both the checked and the unchecked
// decoder, then verifies that the checked decoder rejects truncated input
// without moving the cursor.
TEST(CLeb128Test, get) {
// VALUE is a string literal holding the encoded bytes (its implicit trailing
// NUL is excluded from the readable range), EXPECTED the decoded value and
// SIZE the number of bytes the decoder must consume.
#define EXPECT_CLEB128(VALUE, EXPECTED, SIZE)                                  \
  do {                                                                         \
    const uint8_t *V = reinterpret_cast<const uint8_t *>(VALUE);               \
    const uint8_t *P = V;                                                      \
    const uint8_t *End = V + sizeof(VALUE) - 1;                                \
    uint64_t Result = getUCLeb128(P, End);                                     \
    EXPECT_EQ(Result, EXPECTED);                                               \
    EXPECT_EQ(P - V, SIZE);                                                    \
    P = V;                                                                     \
    Result = getUCLeb128Unsafe(P);                                             \
    EXPECT_EQ(Result, EXPECTED);                                               \
    EXPECT_EQ(P - V, SIZE);                                                    \
  } while (0)

  // Fast path: single byte with LSB = 1 (value = byte >> 1)
  EXPECT_CLEB128("\x01", 0u, 1);
  EXPECT_CLEB128("\x7f", 63u, 1);
  EXPECT_CLEB128("\xff", 127u, 1);
  EXPECT_CLEB128("\x02\x02", 128u, 2);
  // A small value spelled in the (non-minimal) 9-byte form still decodes.
  EXPECT_CLEB128("\x00\x00\x01\x00\x00\x00\x00\x00\x00", 256u, 9);

  // Test (1<<56)-2 in both the 8-byte and the 9-byte form.
  EXPECT_CLEB128("\x80\xfe\xff\xff\xff\xff\xff\xff", 0xfffffffffffffeu, 8);
  EXPECT_CLEB128("\x00\xfe\xff\xff\xff\xff\xff\xff\x00", 0xfffffffffffffeu, 9);

#undef EXPECT_CLEB128

  // Test bounds checking in safe version
  {
    const uint8_t data[] = {0x02, 0x02}; // 2-byte encoding for 128
    const uint8_t *p = data;

    // Insufficient buffer (should return 0 and leave the cursor alone)
    EXPECT_EQ(getUCLeb128(p, data + 1), 0u);
    EXPECT_EQ(p, data);

    // Empty buffer
    p = data;
    EXPECT_EQ(getUCLeb128(p, data), 0u);
    EXPECT_EQ(p, data);
  }

  // Test 9-byte format bounds checking
  {
    const uint8_t data[] = {0x00, 0x01, 0x02, 0x03, 0x04,
                            0x05, 0x06, 0x07, 0x08, 0x09};
    const uint8_t *p = data;

    // Sufficient buffer for 9-byte format: the value is decoded and exactly
    // nine bytes are consumed, leaving the trailing 0x09 unread.
    EXPECT_EQ(getUCLeb128(p, data + 10), 0x0807060504030201ULL);
    EXPECT_EQ(p, data + 9);

    // Insufficient buffer for 9-byte format: 0 is returned and the cursor
    // stays where it was.
    p = data;
    EXPECT_EQ(getUCLeb128(p, data + 8), 0u);
    EXPECT_EQ(p, data);
  }
}

// Encodes one representative value per encoding length and checks that both
// decoders return the original value and consume exactly the bytes written.
TEST(CLeb128Test, encode) {
  // Test round-trip consistency for all encoding lengths.
  const uint64_t vals[] = {
      0,                // 1 byte
      128,              // 2 bytes
      (1ULL << 14) + 2, // 3 bytes
      (1ULL << 21) + 3, // 4 bytes
      (1ULL << 28) + 4, // 5 bytes
      (1ULL << 35) + 5, // 6 bytes
      (1ULL << 42) + 6, // 7 bytes
      (1ULL << 49) + 7, // 8 bytes
      UINT64_MAX / 2,   // 9 bytes
      UINT64_MAX - 1,   // 9 bytes
  };
  for (uint64_t val : vals) {
    std::string encoded;
    raw_string_ostream os(encoded);
    encodeUCLeb128(val, os);

    const uint8_t *p0 = reinterpret_cast<const uint8_t *>(encoded.data());

    // Unchecked decoder.
    const uint8_t *p = p0;
    uint64_t decoded = getUCLeb128Unsafe(p);
    EXPECT_EQ(val, decoded) << "Round-trip failed for value " << val;
    // Cast so both operands are unsigned; comparing ptrdiff_t against size_t
    // triggers sign-compare warnings inside EXPECT_EQ.
    EXPECT_EQ(static_cast<size_t>(p - p0), encoded.size());

    // Checked decoder, given exactly the encoded bytes as its range.
    p = p0;
    decoded = getUCLeb128(p, p0 + encoded.size());
    EXPECT_EQ(val, decoded) << "Checked round-trip failed for value " << val;
    EXPECT_EQ(static_cast<size_t>(p - p0), encoded.size());
  }
}

} // anonymous namespace
Loading