Skip to content

Commit

Permalink
[Support/Hash functions] Change the final() and result() of the h…
Browse files Browse the repository at this point in the history
…ashing functions to return an array of bytes

Returning `std::array<uint8_t, N>` instead of a `StringRef` gives the hashing functions better usage ergonomics:

* When returning `StringRef`, client code has to "jump through hoops" with string manipulations instead of working directly with a fixed-size array of bytes, which is more natural
* Returning `std::array<uint8_t, N>` avoids the need for the hasher classes to keep a field just for the purpose of wrapping it and returning it as a `StringRef`

As part of this patch also:

* Introduce `TruncatedBLAKE3` which is useful for using BLAKE3 as the hasher type for `HashBuilder` with non-default hash sizes.
* Make `MD5Result` inherit from `std::array<uint8_t, 16>` which improves & simplifies its API.

Differential Revision: https://reviews.llvm.org/D123100
  • Loading branch information
akyrtzi committed Apr 6, 2022
1 parent acfc785 commit 330268b
Show file tree
Hide file tree
Showing 29 changed files with 148 additions and 105 deletions.
3 changes: 2 additions & 1 deletion bolt/lib/Core/DebugData.cpp
Expand Up @@ -820,7 +820,8 @@ void DebugAbbrevWriter::addUnitAbbreviations(DWARFUnit &Unit) {
auto hashAndAddAbbrev = [&](StringRef AbbrevData) -> bool {
llvm::SHA1 Hasher;
Hasher.update(AbbrevData);
StringRef Key = Hasher.final();
std::array<uint8_t, 20> Hash = Hasher.final();
StringRef Key((const char *)Hash.data(), Hash.size());
auto Iter = AbbrevDataCache.find(Key);
if (Iter != AbbrevDataCache.end()) {
UnitsAbbrevData[&Unit] = Iter->second.get();
Expand Down
4 changes: 2 additions & 2 deletions clang/include/clang/Basic/Module.h
Expand Up @@ -71,8 +71,8 @@ struct ASTFileSignature : std::array<uint8_t, 20> {
return Value;
}

static ASTFileSignature create(StringRef Bytes) {
return create(Bytes.bytes_begin(), Bytes.bytes_end());
static ASTFileSignature create(std::array<uint8_t, 20> Bytes) {
return ASTFileSignature(std::move(Bytes));
}

static ASTFileSignature createDISentinel() {
Expand Down
6 changes: 2 additions & 4 deletions clang/lib/Serialization/ASTWriter.cpp
Expand Up @@ -1117,17 +1117,15 @@ std::pair<ASTFileSignature, ASTFileSignature>
ASTWriter::createSignature(StringRef AllBytes, StringRef ASTBlockBytes) {
llvm::SHA1 Hasher;
Hasher.update(ASTBlockBytes);
auto Hash = Hasher.result();
ASTFileSignature ASTBlockHash = ASTFileSignature::create(Hash);
ASTFileSignature ASTBlockHash = ASTFileSignature::create(Hasher.result());

// Add the remaining bytes (i.e. bytes before the unhashed control block that
// are not part of the AST block).
Hasher.update(
AllBytes.take_front(ASTBlockBytes.bytes_end() - AllBytes.bytes_begin()));
Hasher.update(
AllBytes.take_back(AllBytes.bytes_end() - ASTBlockBytes.bytes_end()));
Hash = Hasher.result();
ASTFileSignature Signature = ASTFileSignature::create(Hash);
ASTFileSignature Signature = ASTFileSignature::create(Hasher.result());

return std::make_pair(ASTBlockHash, Signature);
}
Expand Down
2 changes: 1 addition & 1 deletion lld/MachO/SyntheticSections.cpp
Expand Up @@ -1202,7 +1202,7 @@ void CodeSignatureSection::writeHashes(uint8_t *buf) const {
std::min(codeEnd - code, static_cast<ssize_t>(blockSize)));
SHA256 hasher;
hasher.update(block);
StringRef hash = hasher.final();
std::array<uint8_t, 32> hash = hasher.final();
assert(hash.size() == hashSize);
memcpy(hashes, hash.data(), hashSize);
code += blockSize;
Expand Down
39 changes: 38 additions & 1 deletion llvm/include/llvm/Support/BLAKE3.h
Expand Up @@ -34,7 +34,7 @@ namespace llvm {
template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
using BLAKE3Result = std::array<uint8_t, NumBytes>;

/// A class that wrap the BLAKE3 algorithm.
/// A class that wraps the BLAKE3 algorithm.
class BLAKE3 {
public:
BLAKE3() { init(); }
Expand Down Expand Up @@ -70,6 +70,17 @@ class BLAKE3 {
return Result;
}

/// Return the current output for the digested data since the last call to
/// init().
///
/// Other hash functions distinguish between \p result() and \p final(), with
/// \p result() allowing more calls into \p update(), but there's no
/// difference for the BLAKE3 hash function.
///
/// \tparam NumBytes size in bytes of the returned hash array; defaults to
/// LLVM_BLAKE3_OUT_LEN.
template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
BLAKE3Result<NumBytes> result() {
// Forwards to final<NumBytes>() unchanged.
return final<NumBytes>();
}

/// Returns a BLAKE3 hash for the given data.
template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
static BLAKE3Result<NumBytes> hash(ArrayRef<uint8_t> Data) {
Expand All @@ -82,6 +93,32 @@ class BLAKE3 {
llvm_blake3_hasher Hasher;
};

/// Like \p BLAKE3 but using a class-level template parameter for specifying the
/// hash size of the \p final() and \p result() functions.
///
/// This is useful for using BLAKE3 as the hasher type for \p HashBuilder with
/// non-default hash sizes.
///
/// \tparam NumBytes size in bytes of the hash produced by \p final() and
/// \p result().
template <size_t NumBytes> class TruncatedBLAKE3 : public BLAKE3 {
public:
/// Finalize the hasher and put the result in \p Result.
/// This doesn't modify the hasher itself, and it's possible to finalize again
/// after adding more input.
void final(BLAKE3Result<NumBytes> &Result) { return BLAKE3::final(Result); }

/// Finalize the hasher and return an output of \p NumBytes bytes (the size is
/// fixed by the class template parameter rather than chosen per call).
/// This doesn't modify the hasher itself, and it's possible to finalize again
/// after adding more input.
BLAKE3Result<NumBytes> final() { return BLAKE3::final<NumBytes>(); }

/// Return the current output for the digested data since the last call to
/// init().
///
/// Other hash functions distinguish between \p result() and \p final(), with
/// \p result() allowing more calls into \p update(), but there's no
/// difference for the BLAKE3 hash function.
BLAKE3Result<NumBytes> result() { return BLAKE3::result<NumBytes>(); }
};

} // namespace llvm

#endif
7 changes: 5 additions & 2 deletions llvm/include/llvm/Support/HashBuilder.h
Expand Up @@ -39,6 +39,9 @@ struct IsHashableData
/// Declares the hasher member, and functions forwarding directly to the hasher.
template <typename HasherT> class HashBuilderBase {
public:
template <typename HasherT_ = HasherT>
using HashResultTy = decltype(std::declval<HasherT_ &>().final());

HasherT &getHasher() { return Hasher; }

/// Forward to `HasherT::update(ArrayRef<uint8_t>)`.
Expand All @@ -59,12 +62,12 @@ template <typename HasherT> class HashBuilderBase {
}

/// Forward to `HasherT::final()` if available.
template <typename HasherT_ = HasherT> StringRef final() {
template <typename HasherT_ = HasherT> HashResultTy<HasherT_> final() {
return this->getHasher().final();
}

/// Forward to `HasherT::result()` if available.
template <typename HasherT_ = HasherT> StringRef result() {
template <typename HasherT_ = HasherT> HashResultTy<HasherT_> result() {
return this->getHasher().result();
}

Expand Down
29 changes: 8 additions & 21 deletions llvm/include/llvm/Support/MD5.h
Expand Up @@ -40,26 +40,19 @@ template <typename T> class ArrayRef;

class MD5 {
public:
struct MD5Result {
std::array<uint8_t, 16> Bytes;

operator std::array<uint8_t, 16>() const { return Bytes; }

const uint8_t &operator[](size_t I) const { return Bytes[I]; }
uint8_t &operator[](size_t I) { return Bytes[I]; }

struct MD5Result : public std::array<uint8_t, 16> {
SmallString<32> digest() const;

uint64_t low() const {
// Our MD5 implementation returns the result in little endian, so the low
// word is first.
using namespace support;
return endian::read<uint64_t, little, unaligned>(Bytes.data());
return endian::read<uint64_t, little, unaligned>(data());
}

uint64_t high() const {
using namespace support;
return endian::read<uint64_t, little, unaligned>(Bytes.data() + 8);
return endian::read<uint64_t, little, unaligned>(data() + 8);
}
std::pair<uint64_t, uint64_t> words() const {
using namespace support;
Expand All @@ -78,20 +71,20 @@ class MD5 {
/// Finishes off the hash and puts the result in result.
void final(MD5Result &Result);

/// Finishes off the hash, and returns a reference to the 16-byte hash data.
StringRef final();
/// Finishes off the hash, and returns the 16-byte hash data.
MD5Result final();

/// Finishes off the hash, and returns a reference to the 16-byte hash data.
/// Finishes off the hash, and returns the 16-byte hash data.
/// This is suitable for getting the MD5 at any time without invalidating the
/// internal state, so that more calls can be made into `update`.
StringRef result();
MD5Result result();

/// Translates the bytes in \p Res to a hex string that is
/// deposited into \p Str. The result will be of length 32.
static void stringifyResult(MD5Result &Result, SmallVectorImpl<char> &Str);

/// Computes the hash for a given bytes.
static std::array<uint8_t, 16> hash(ArrayRef<uint8_t> Data);
static MD5Result hash(ArrayRef<uint8_t> Data);

private:
// Any 32-bit or wider unsigned integer data type will do.
Expand All @@ -109,15 +102,9 @@ class MD5 {
MD5_u32plus block[16];
} InternalState;

MD5Result Result;

const uint8_t *body(ArrayRef<uint8_t> Data);
};

inline bool operator==(const MD5::MD5Result &LHS, const MD5::MD5Result &RHS) {
return LHS.Bytes == RHS.Bytes;
}

/// Helper to compute and return lower 64 bits of the given string's MD5 hash.
inline uint64_t MD5Hash(StringRef Str) {
using namespace support;
Expand Down
13 changes: 6 additions & 7 deletions llvm/include/llvm/Support/SHA1.h
Expand Up @@ -36,17 +36,17 @@ class SHA1 {
/// Digest more data.
void update(StringRef Str);

/// Return a reference to the current raw 160-bits SHA1 for the digested data
/// Return the current raw 160-bits SHA1 for the digested data
/// since the last call to init(). This call will add data to the internal
/// state and as such is not suited for getting an intermediate result
/// (see result()).
StringRef final();
std::array<uint8_t, 20> final();

/// Return a reference to the current raw 160-bits SHA1 for the digested data
/// Return the current raw 160-bits SHA1 for the digested data
/// since the last call to init(). This is suitable for getting the SHA1 at
/// any time without invalidating the internal state so that more calls can be
/// made into update.
StringRef result();
std::array<uint8_t, 20> result();

/// Returns a raw 160-bit SHA1 hash for the given data.
static std::array<uint8_t, 20> hash(ArrayRef<uint8_t> Data);
Expand All @@ -68,14 +68,13 @@ class SHA1 {
uint8_t BufferOffset;
} InternalState;

// Internal copy of the hash, populated and accessed on calls to result()
uint32_t HashResult[HASH_LENGTH / 4];

// Helper
void writebyte(uint8_t data);
void hashBlock();
void addUncounted(uint8_t data);
void pad();

void final(std::array<uint32_t, HASH_LENGTH / 4> &HashResult);
};

} // end llvm namespace
Expand Down
13 changes: 6 additions & 7 deletions llvm/include/llvm/Support/SHA256.h
Expand Up @@ -43,17 +43,17 @@ class SHA256 {
/// Digest more data.
void update(StringRef Str);

/// Return a reference to the current raw 256-bits SHA256 for the digested
/// Return the current raw 256-bits SHA256 for the digested
/// data since the last call to init(). This call will add data to the
/// internal state and as such is not suited for getting an intermediate
/// result (see result()).
StringRef final();
std::array<uint8_t, 32> final();

/// Return a reference to the current raw 256-bits SHA256 for the digested
/// Return the current raw 256-bits SHA256 for the digested
/// data since the last call to init(). This is suitable for getting the
/// SHA256 at any time without invalidating the internal state so that more
/// calls can be made into update.
StringRef result();
std::array<uint8_t, 32> result();

/// Returns a raw 256-bit SHA256 hash for the given data.
static std::array<uint8_t, 32> hash(ArrayRef<uint8_t> Data);
Expand All @@ -75,14 +75,13 @@ class SHA256 {
uint8_t BufferOffset;
} InternalState;

// Internal copy of the hash, populated and accessed on calls to result()
uint32_t HashResult[HASH_LENGTH / 4];

// Helper
void writebyte(uint8_t data);
void hashBlock();
void addUncounted(uint8_t data);
void pad();

void final(std::array<uint32_t, HASH_LENGTH / 4> &HashResult);
};

} // namespace llvm
Expand Down
2 changes: 1 addition & 1 deletion llvm/include/llvm/Support/raw_sha1_ostream.h
Expand Up @@ -30,7 +30,7 @@ class raw_sha1_ostream : public raw_ostream {

public:
/// Return the current SHA1 hash for the content of the stream
StringRef sha1() {
std::array<uint8_t, 20> sha1() {
flush();
return State.result();
}
Expand Down
6 changes: 3 additions & 3 deletions llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
Expand Up @@ -903,22 +903,22 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel,
else {
// Recompute the hash and compare it to the one in the bitcode
SHA1 Hasher;
StringRef Hash;
std::array<uint8_t, 20> Hash;
Hasher.update(*CheckHash);
{
int BlockSize = (CurrentRecordPos / 8) - BlockEntryPos;
auto Ptr = Stream.getPointerToByte(BlockEntryPos, BlockSize);
Hasher.update(ArrayRef<uint8_t>(Ptr, BlockSize));
Hash = Hasher.result();
}
std::array<char, 20> RecordedHash;
std::array<uint8_t, 20> RecordedHash;
int Pos = 0;
for (auto &Val : Record) {
assert(!(Val >> 32) && "Unexpected high bits set");
support::endian::write32be(&RecordedHash[Pos], Val);
Pos += 4;
}
if (Hash == StringRef(RecordedHash.data(), RecordedHash.size()))
if (Hash == RecordedHash)
O->OS << " (match)";
else
O->OS << " (!mismatch!)";
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
Expand Up @@ -4387,7 +4387,7 @@ void ModuleBitcodeWriter::writeModuleHash(size_t BlockStartPos) {
uint32_t Vals[5];
Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&(Buffer)[BlockStartPos],
Buffer.size() - BlockStartPos));
StringRef Hash = Hasher.result();
std::array<uint8_t, 20> Hash = Hasher.result();
for (int Pos = 0; Pos < 20; Pos += 4) {
Vals[Pos / 4] = support::endian::read32be(Hash.data() + Pos);
}
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
Expand Up @@ -3536,6 +3536,6 @@ Optional<MD5::MD5Result> DwarfDebug::getMD5AsBytes(const DIFile *File) const {
// An MD5 checksum is 16 bytes.
std::string ChecksumString = fromHex(Checksum->Value);
MD5::MD5Result CKMem;
std::copy(ChecksumString.begin(), ChecksumString.end(), CKMem.Bytes.data());
std::copy(ChecksumString.begin(), ChecksumString.end(), CKMem.data());
return CKMem;
}
3 changes: 2 additions & 1 deletion llvm/lib/DebugInfo/CodeView/TypeHashing.cpp
Expand Up @@ -76,5 +76,6 @@ GloballyHashedType::hashType(ArrayRef<uint8_t> RecordData,
auto TrailingBytes = RecordData.drop_front(Off);
S.update(TrailingBytes);

return {S.final().take_back(8)};
std::array<uint8_t, 20> Hash = S.final();
return {ArrayRef<uint8_t>(Hash).take_back(8)};
}
2 changes: 1 addition & 1 deletion llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
Expand Up @@ -341,7 +341,7 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
errc::invalid_argument,
"failed to parse file entry because the MD5 hash is invalid");
std::uninitialized_copy_n(Value.getAsBlock().getValue().begin(), 16,
FileEntry.Checksum.Bytes.begin());
FileEntry.Checksum.begin());
break;
default:
break;
Expand Down
3 changes: 1 addition & 2 deletions llvm/lib/MC/MCDwarf.cpp
Expand Up @@ -387,8 +387,7 @@ static void emitOneV5FileEntry(MCStreamer *MCOS, const MCDwarfFile &DwarfFile,
if (EmitMD5) {
const MD5::MD5Result &Cksum = *DwarfFile.Checksum;
MCOS->emitBinaryData(
StringRef(reinterpret_cast<const char *>(Cksum.Bytes.data()),
Cksum.Bytes.size()));
StringRef(reinterpret_cast<const char *>(Cksum.data()), Cksum.size()));
}
if (HasSource) {
if (LineStr)
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/MC/MCParser/AsmParser.cpp
Expand Up @@ -3573,8 +3573,8 @@ bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
if (HasMD5) {
MD5::MD5Result Sum;
for (unsigned i = 0; i != 8; ++i) {
Sum.Bytes[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
Sum.Bytes[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
Sum[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
Sum[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
}
CKMem = Sum;
}
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/MC/MCParser/MasmParser.cpp
Expand Up @@ -4907,8 +4907,8 @@ bool MasmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
if (HasMD5) {
MD5::MD5Result Sum;
for (unsigned i = 0; i != 8; ++i) {
Sum.Bytes[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
Sum.Bytes[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
Sum[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
Sum[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
}
CKMem = Sum;
}
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/ObjCopy/MachO/MachOWriter.cpp
Expand Up @@ -570,7 +570,7 @@ void MachOWriter::writeCodeSignatureData() {
static_cast<ssize_t>(CodeSignature.BlockSize)));
SHA256 Hasher;
Hasher.update(Block);
StringRef Hash = Hasher.final();
std::array<uint8_t, 32> Hash = Hasher.final();
assert(Hash.size() == CodeSignature.HashSize);
memcpy(CurrHashWritePosition, Hash.data(), CodeSignature.HashSize);
CurrHashReadPosition += CodeSignature.BlockSize;
Expand Down

0 comments on commit 330268b

Please sign in to comment.