[Support/Hash functions] Change the final() and result() of the hashing functions to return an array of bytes

Returning `std::array<uint8_t, N>` offers better ergonomics for users of the hashing functions than returning a `StringRef`:
* When a `StringRef` is returned, client code ends up "jumping through hoops" with string manipulations instead of dealing directly with a fixed-size array of bytes, which is more natural.
* Returning `std::array<uint8_t, N>` avoids the need for the hasher classes to keep a field whose only purpose is to be wrapped and returned as a `StringRef`.

As part of this patch also:
* Introduce `TruncatedBLAKE3`, which is useful for using BLAKE3 as the hasher type for `HashBuilder` with non-default hash sizes.
* Make `MD5Result` inherit from `std::array<uint8_t, 16>`, which improves and simplifies its API.

Differential Revision: https://reviews.llvm.org/D123100
llvm · Apr 6, 2022 · 330268b · 330268b
1 parent acfc785
commit 330268b
Show file tree

Hide file tree

Showing 29 changed files with 148 additions and 105 deletions.
diff --git a/bolt/lib/Core/DebugData.cpp b/bolt/lib/Core/DebugData.cpp
@@ -820,7 +820,8 @@ void DebugAbbrevWriter::addUnitAbbreviations(DWARFUnit &Unit) {
   auto hashAndAddAbbrev = [&](StringRef AbbrevData) -> bool {
     llvm::SHA1 Hasher;
     Hasher.update(AbbrevData);
-    StringRef Key = Hasher.final();
+    std::array<uint8_t, 20> Hash = Hasher.final();
+    StringRef Key((const char *)Hash.data(), Hash.size());
     auto Iter = AbbrevDataCache.find(Key);
     if (Iter != AbbrevDataCache.end()) {
       UnitsAbbrevData[&Unit] = Iter->second.get();

diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h
@@ -71,8 +71,8 @@ struct ASTFileSignature : std::array<uint8_t, 20> {
     return Value;
   }
 
-  static ASTFileSignature create(StringRef Bytes) {
-    return create(Bytes.bytes_begin(), Bytes.bytes_end());
+  static ASTFileSignature create(std::array<uint8_t, 20> Bytes) {
+    return ASTFileSignature(std::move(Bytes));
   }
 
   static ASTFileSignature createDISentinel() {

diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
@@ -1117,17 +1117,15 @@ std::pair<ASTFileSignature, ASTFileSignature>
 ASTWriter::createSignature(StringRef AllBytes, StringRef ASTBlockBytes) {
   llvm::SHA1 Hasher;
   Hasher.update(ASTBlockBytes);
-  auto Hash = Hasher.result();
-  ASTFileSignature ASTBlockHash = ASTFileSignature::create(Hash);
+  ASTFileSignature ASTBlockHash = ASTFileSignature::create(Hasher.result());
 
   // Add the remaining bytes (i.e. bytes before the unhashed control block that
   // are not part of the AST block).
   Hasher.update(
       AllBytes.take_front(ASTBlockBytes.bytes_end() - AllBytes.bytes_begin()));
   Hasher.update(
       AllBytes.take_back(AllBytes.bytes_end() - ASTBlockBytes.bytes_end()));
-  Hash = Hasher.result();
-  ASTFileSignature Signature = ASTFileSignature::create(Hash);
+  ASTFileSignature Signature = ASTFileSignature::create(Hasher.result());
 
   return std::make_pair(ASTBlockHash, Signature);
 }

diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
@@ -1202,7 +1202,7 @@ void CodeSignatureSection::writeHashes(uint8_t *buf) const {
                     std::min(codeEnd - code, static_cast<ssize_t>(blockSize)));
     SHA256 hasher;
     hasher.update(block);
-    StringRef hash = hasher.final();
+    std::array<uint8_t, 32> hash = hasher.final();
     assert(hash.size() == hashSize);
     memcpy(hashes, hash.data(), hashSize);
     code += blockSize;

diff --git a/llvm/include/llvm/Support/BLAKE3.h b/llvm/include/llvm/Support/BLAKE3.h
@@ -34,7 +34,7 @@ namespace llvm {
 template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
 using BLAKE3Result = std::array<uint8_t, NumBytes>;
 
-/// A class that wrap the BLAKE3 algorithm.
+/// A class that wraps the BLAKE3 algorithm.
 class BLAKE3 {
 public:
   BLAKE3() { init(); }
@@ -70,6 +70,17 @@ class BLAKE3 {
     return Result;
   }
 
+  /// Return the current output for the digested data since the last call to
+  /// init().
+  ///
+  /// Other hash functions distinguish between \p result() and \p final(), with
+  /// \p result() allowing more calls into \p update(), but there's no
+  /// difference for the BLAKE3 hash function.
+  template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
+  BLAKE3Result<NumBytes> result() {
+    return final<NumBytes>();
+  }
+
   /// Returns a BLAKE3 hash for the given data.
   template <size_t NumBytes = LLVM_BLAKE3_OUT_LEN>
   static BLAKE3Result<NumBytes> hash(ArrayRef<uint8_t> Data) {
@@ -82,6 +93,32 @@ class BLAKE3 {
   llvm_blake3_hasher Hasher;
 };
 
+/// Like \p BLAKE3 but using a class-level template parameter for specifying the
+/// hash size of the \p final() and \p result() functions.
+///
+/// This is useful for using BLAKE3 as the hasher type for \p HashBuilder with
+/// non-default hash sizes.
+template <size_t NumBytes> class TruncatedBLAKE3 : public BLAKE3 {
+public:
+  /// Finalize the hasher and put the result in \p Result.
+  /// This doesn't modify the hasher itself, and it's possible to finalize again
+  /// after adding more input.
+  void final(BLAKE3Result<NumBytes> &Result) { return BLAKE3::final(Result); }
+
+  /// Finalize the hasher and return an output of \p NumBytes bytes.
+  /// This doesn't modify the hasher itself, and it's possible to finalize again
+  /// after adding more input.
+  BLAKE3Result<NumBytes> final() { return BLAKE3::final<NumBytes>(); }
+
+  /// Return the current output for the digested data since the last call to
+  /// init().
+  ///
+  /// Other hash functions distinguish between \p result() and \p final(), with
+  /// \p result() allowing more calls into \p update(), but there's no
+  /// difference for the BLAKE3 hash function.
+  BLAKE3Result<NumBytes> result() { return BLAKE3::result<NumBytes>(); }
+};
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/Support/HashBuilder.h b/llvm/include/llvm/Support/HashBuilder.h
@@ -39,6 +39,9 @@ struct IsHashableData
 /// Declares the hasher member, and functions forwarding directly to the hasher.
 template <typename HasherT> class HashBuilderBase {
 public:
+  template <typename HasherT_ = HasherT>
+  using HashResultTy = decltype(std::declval<HasherT_ &>().final());
+
   HasherT &getHasher() { return Hasher; }
 
   /// Forward to `HasherT::update(ArrayRef<uint8_t>)`.
@@ -59,12 +62,12 @@ template <typename HasherT> class HashBuilderBase {
   }
 
   /// Forward to `HasherT::final()` if available.
-  template <typename HasherT_ = HasherT> StringRef final() {
+  template <typename HasherT_ = HasherT> HashResultTy<HasherT_> final() {
     return this->getHasher().final();
   }
 
   /// Forward to `HasherT::result()` if available.
-  template <typename HasherT_ = HasherT> StringRef result() {
+  template <typename HasherT_ = HasherT> HashResultTy<HasherT_> result() {
     return this->getHasher().result();
   }
 

diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h
@@ -40,26 +40,19 @@ template <typename T> class ArrayRef;
 
 class MD5 {
 public:
-  struct MD5Result {
-    std::array<uint8_t, 16> Bytes;
-
-    operator std::array<uint8_t, 16>() const { return Bytes; }
-
-    const uint8_t &operator[](size_t I) const { return Bytes[I]; }
-    uint8_t &operator[](size_t I) { return Bytes[I]; }
-
+  struct MD5Result : public std::array<uint8_t, 16> {
     SmallString<32> digest() const;
 
     uint64_t low() const {
       // Our MD5 implementation returns the result in little endian, so the low
       // word is first.
       using namespace support;
-      return endian::read<uint64_t, little, unaligned>(Bytes.data());
+      return endian::read<uint64_t, little, unaligned>(data());
     }
 
     uint64_t high() const {
       using namespace support;
-      return endian::read<uint64_t, little, unaligned>(Bytes.data() + 8);
+      return endian::read<uint64_t, little, unaligned>(data() + 8);
     }
     std::pair<uint64_t, uint64_t> words() const {
       using namespace support;
@@ -78,20 +71,20 @@ class MD5 {
   /// Finishes off the hash and puts the result in result.
   void final(MD5Result &Result);
 
-  /// Finishes off the hash, and returns a reference to the 16-byte hash data.
-  StringRef final();
+  /// Finishes off the hash, and returns the 16-byte hash data.
+  MD5Result final();
 
-  /// Finishes off the hash, and returns a reference to the 16-byte hash data.
+  /// Finishes off the hash, and returns the 16-byte hash data.
   /// This is suitable for getting the MD5 at any time without invalidating the
   /// internal state, so that more calls can be made into `update`.
-  StringRef result();
+  MD5Result result();
 
   /// Translates the bytes in \p Result to a hex string that is
   /// deposited into \p Str. The result will be of length 32.
   static void stringifyResult(MD5Result &Result, SmallVectorImpl<char> &Str);
 
   /// Computes the hash for the given bytes.
-  static std::array<uint8_t, 16> hash(ArrayRef<uint8_t> Data);
+  static MD5Result hash(ArrayRef<uint8_t> Data);
 
 private:
   // Any 32-bit or wider unsigned integer data type will do.
@@ -109,15 +102,9 @@ class MD5 {
     MD5_u32plus block[16];
   } InternalState;
 
-  MD5Result Result;
-
   const uint8_t *body(ArrayRef<uint8_t> Data);
 };
 
-inline bool operator==(const MD5::MD5Result &LHS, const MD5::MD5Result &RHS) {
-  return LHS.Bytes == RHS.Bytes;
-}
-
 /// Helper to compute and return lower 64 bits of the given string's MD5 hash.
 inline uint64_t MD5Hash(StringRef Str) {
   using namespace support;

diff --git a/llvm/include/llvm/Support/SHA1.h b/llvm/include/llvm/Support/SHA1.h
@@ -36,17 +36,17 @@ class SHA1 {
   /// Digest more data.
   void update(StringRef Str);
 
-  /// Return a reference to the current raw 160-bits SHA1 for the digested data
+  /// Return the current raw 160-bits SHA1 for the digested data
   /// since the last call to init(). This call will add data to the internal
   /// state and as such is not suited for getting an intermediate result
   /// (see result()).
-  StringRef final();
+  std::array<uint8_t, 20> final();
 
-  /// Return a reference to the current raw 160-bits SHA1 for the digested data
+  /// Return the current raw 160-bits SHA1 for the digested data
   /// since the last call to init(). This is suitable for getting the SHA1 at
   /// any time without invalidating the internal state so that more calls can be
   /// made into update.
-  StringRef result();
+  std::array<uint8_t, 20> result();
 
   /// Returns a raw 160-bit SHA1 hash for the given data.
   static std::array<uint8_t, 20> hash(ArrayRef<uint8_t> Data);
@@ -68,14 +68,13 @@ class SHA1 {
     uint8_t BufferOffset;
   } InternalState;
 
-  // Internal copy of the hash, populated and accessed on calls to result()
-  uint32_t HashResult[HASH_LENGTH / 4];
-
   // Helper
   void writebyte(uint8_t data);
   void hashBlock();
   void addUncounted(uint8_t data);
   void pad();
+
+  void final(std::array<uint32_t, HASH_LENGTH / 4> &HashResult);
 };
 
 } // end llvm namespace

diff --git a/llvm/include/llvm/Support/SHA256.h b/llvm/include/llvm/Support/SHA256.h
@@ -43,17 +43,17 @@ class SHA256 {
   /// Digest more data.
   void update(StringRef Str);
 
-  /// Return a reference to the current raw 256-bits SHA256 for the digested
+  /// Return the current raw 256-bits SHA256 for the digested
   /// data since the last call to init(). This call will add data to the
   /// internal state and as such is not suited for getting an intermediate
   /// result (see result()).
-  StringRef final();
+  std::array<uint8_t, 32> final();
 
-  /// Return a reference to the current raw 256-bits SHA256 for the digested
+  /// Return the current raw 256-bits SHA256 for the digested
   /// data since the last call to init(). This is suitable for getting the
   /// SHA256 at any time without invalidating the internal state so that more
   /// calls can be made into update.
-  StringRef result();
+  std::array<uint8_t, 32> result();
 
   /// Returns a raw 256-bit SHA256 hash for the given data.
   static std::array<uint8_t, 32> hash(ArrayRef<uint8_t> Data);
@@ -75,14 +75,13 @@ class SHA256 {
     uint8_t BufferOffset;
   } InternalState;
 
-  // Internal copy of the hash, populated and accessed on calls to result()
-  uint32_t HashResult[HASH_LENGTH / 4];
-
   // Helper
   void writebyte(uint8_t data);
   void hashBlock();
   void addUncounted(uint8_t data);
   void pad();
+
+  void final(std::array<uint32_t, HASH_LENGTH / 4> &HashResult);
 };
 
 } // namespace llvm

diff --git a/llvm/include/llvm/Support/raw_sha1_ostream.h b/llvm/include/llvm/Support/raw_sha1_ostream.h
@@ -30,7 +30,7 @@ class raw_sha1_ostream : public raw_ostream {
 
 public:
   /// Return the current SHA1 hash for the content of the stream
-  StringRef sha1() {
+  std::array<uint8_t, 20> sha1() {
     flush();
     return State.result();
   }

diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -903,22 +903,22 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel,
         else {
           // Recompute the hash and compare it to the one in the bitcode
           SHA1 Hasher;
-          StringRef Hash;
+          std::array<uint8_t, 20> Hash;
           Hasher.update(*CheckHash);
           {
             int BlockSize = (CurrentRecordPos / 8) - BlockEntryPos;
             auto Ptr = Stream.getPointerToByte(BlockEntryPos, BlockSize);
             Hasher.update(ArrayRef<uint8_t>(Ptr, BlockSize));
             Hash = Hasher.result();
           }
-          std::array<char, 20> RecordedHash;
+          std::array<uint8_t, 20> RecordedHash;
           int Pos = 0;
           for (auto &Val : Record) {
             assert(!(Val >> 32) && "Unexpected high bits set");
             support::endian::write32be(&RecordedHash[Pos], Val);
             Pos += 4;
           }
-          if (Hash == StringRef(RecordedHash.data(), RecordedHash.size()))
+          if (Hash == RecordedHash)
             O->OS << " (match)";
           else
             O->OS << " (!mismatch!)";

diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -4387,7 +4387,7 @@ void ModuleBitcodeWriter::writeModuleHash(size_t BlockStartPos) {
     uint32_t Vals[5];
     Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&(Buffer)[BlockStartPos],
                                     Buffer.size() - BlockStartPos));
-    StringRef Hash = Hasher.result();
+    std::array<uint8_t, 20> Hash = Hasher.result();
     for (int Pos = 0; Pos < 20; Pos += 4) {
       Vals[Pos / 4] = support::endian::read32be(Hash.data() + Pos);
     }

diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -3536,6 +3536,6 @@ Optional<MD5::MD5Result> DwarfDebug::getMD5AsBytes(const DIFile *File) const {
   // An MD5 checksum is 16 bytes.
   std::string ChecksumString = fromHex(Checksum->Value);
   MD5::MD5Result CKMem;
-  std::copy(ChecksumString.begin(), ChecksumString.end(), CKMem.Bytes.data());
+  std::copy(ChecksumString.begin(), ChecksumString.end(), CKMem.data());
   return CKMem;
 }
diff --git a/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp b/llvm/lib/DebugInfo/CodeView/TypeHashing.cpp
@@ -76,5 +76,6 @@ GloballyHashedType::hashType(ArrayRef<uint8_t> RecordData,
   auto TrailingBytes = RecordData.drop_front(Off);
   S.update(TrailingBytes);
 
-  return {S.final().take_back(8)};
+  std::array<uint8_t, 20> Hash = S.final();
+  return {ArrayRef<uint8_t>(Hash).take_back(8)};
 }
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -341,7 +341,7 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
               errc::invalid_argument,
               "failed to parse file entry because the MD5 hash is invalid");
         std::uninitialized_copy_n(Value.getAsBlock().getValue().begin(), 16,
-                                  FileEntry.Checksum.Bytes.begin());
+                                  FileEntry.Checksum.begin());
         break;
       default:
         break;

diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
@@ -387,8 +387,7 @@ static void emitOneV5FileEntry(MCStreamer *MCOS, const MCDwarfFile &DwarfFile,
   if (EmitMD5) {
     const MD5::MD5Result &Cksum = *DwarfFile.Checksum;
     MCOS->emitBinaryData(
-        StringRef(reinterpret_cast<const char *>(Cksum.Bytes.data()),
-                  Cksum.Bytes.size()));
+        StringRef(reinterpret_cast<const char *>(Cksum.data()), Cksum.size()));
   }
   if (HasSource) {
     if (LineStr)

diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -3573,8 +3573,8 @@ bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
     if (HasMD5) {
       MD5::MD5Result Sum;
       for (unsigned i = 0; i != 8; ++i) {
-        Sum.Bytes[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
-        Sum.Bytes[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
+        Sum[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
+        Sum[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
       }
       CKMem = Sum;
     }

diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -4907,8 +4907,8 @@ bool MasmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
     if (HasMD5) {
       MD5::MD5Result Sum;
       for (unsigned i = 0; i != 8; ++i) {
-        Sum.Bytes[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
-        Sum.Bytes[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
+        Sum[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
+        Sum[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
       }
       CKMem = Sum;
     }

diff --git a/llvm/lib/ObjCopy/MachO/MachOWriter.cpp b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp
@@ -570,7 +570,7 @@ void MachOWriter::writeCodeSignatureData() {
                              static_cast<ssize_t>(CodeSignature.BlockSize)));
     SHA256 Hasher;
     Hasher.update(Block);
-    StringRef Hash = Hasher.final();
+    std::array<uint8_t, 32> Hash = Hasher.final();
     assert(Hash.size() == CodeSignature.HashSize);
     memcpy(CurrHashWritePosition, Hash.data(), CodeSignature.HashSize);
     CurrHashReadPosition += CodeSignature.BlockSize;