[llvm-profdata] Do not create numerical strings for MD5 function name…

…s read from a Sample Profile. (#66164) This is phase 2 of the MD5 refactoring on Sample Profile following https://reviews.llvm.org/D147740 In previous implementation, when a MD5 Sample Profile is read, the reader first converts the MD5 values to strings, and then create a StringRef as if the numerical strings are regular function names, and later on IPO transformation passes perform string comparison over these numerical strings for profile matching. This is inefficient since it causes many small heap allocations. In this patch I created a class `ProfileFuncRef` that is similar to `StringRef` but it can represent a hash value directly without any conversion, and it will be more efficient (I will attach some benchmark results later) when being used in associative containers. ProfileFuncRef guarantees the same function name in string form or in MD5 form has the same hash value, which also fix a few issue in IPO passes where function matching/lookup only check for function name string, while returns a no-match if the profile is MD5. When testing on an internal large profile (> 1 GB, with more than 10 million functions), the full profile load time is reduced from 28 sec to 25 sec in average, and reading function offset table from 0.78s to 0.7s
llvm · Oct 17, 2023 · ef0e0ad · ef0e0ad
1 parent e90ec58
commit ef0e0ad
Show file tree

Hide file tree

Showing 22 changed files with 797 additions and 513 deletions.
diff --git a/llvm/include/llvm/ProfileData/FunctionId.h b/llvm/include/llvm/ProfileData/FunctionId.h
@@ -0,0 +1,213 @@
+//===--- FunctionId.h - Sample profile function object ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+///
+/// Defines FunctionId class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PROFILEDATA_FUNCTIONID_H
+#define LLVM_PROFILEDATA_FUNCTIONID_H
+
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+
+namespace llvm {
+namespace sampleprof {
+
+/// This class represents a function that is read from a sample profile. It
+/// comes with two forms: a string or a hash code. The latter form is the 64-bit
+/// MD5 of the function name for efficient storage supported by ExtBinary
+/// profile format, and when reading the profile, this class can represent it
+/// without converting it to a string first.
+/// When representing a hash code, we utilize the LengthOrHashCode field to
+/// store it, and Name is set to null. When representing a string, it is same as
+/// StringRef.
+class FunctionId {
+
+  const char *Data = nullptr;
+
+  // Use uint64_t instead of size_t so that it can also hold a MD5 value on
+  // 32-bit system.
+  uint64_t LengthOrHashCode = 0;
+
+  /// Extension to memcmp to handle hash code representation. If both are hash
+  /// values, Lhs and Rhs are both null, function returns 0 (and needs an extra
+  /// comparison using getIntValue). If only one is hash code, it is considered
+  /// less than the StringRef one. Otherwise perform normal string comparison.
+  static int compareMemory(const char *Lhs, const char *Rhs, uint64_t Length) {
+    if (Lhs == Rhs)
+      return 0;
+    if (!Lhs)
+      return -1;
+    if (!Rhs)
+      return 1;
+    return ::memcmp(Lhs, Rhs, (size_t)Length);
+  }
+
+public:
+  FunctionId() = default;
+
+  /// Constructor from a StringRef.
+  explicit FunctionId(StringRef Str)
+      : Data(Str.data()), LengthOrHashCode(Str.size()) {
+  }
+
+  /// Constructor from a hash code.
+  explicit FunctionId(uint64_t HashCode)
+      : LengthOrHashCode(HashCode) {
+    assert(HashCode != 0);
+  }
+
+  /// Check for equality. Similar to StringRef::equals, but will also cover for
+  /// the case where one or both are hash codes. Comparing their int values are
+  /// sufficient. A hash code FunctionId is considered not equal to a StringRef
+  /// FunctionId regardless of actual contents.
+  bool equals(const FunctionId &Other) const {
+    return LengthOrHashCode == Other.LengthOrHashCode &&
+           compareMemory(Data, Other.Data, LengthOrHashCode) == 0;
+  }
+
+  /// Total order comparison. If both FunctionId are StringRef, this is the same
+  /// as StringRef::compare. If one of them is StringRef, it is considered
+  /// greater than the hash code FunctionId. Otherwise this is the the same
+  /// as comparing their int values.
+  int compare(const FunctionId &Other) const {
+    auto Res = compareMemory(
+        Data, Other.Data, std::min(LengthOrHashCode, Other.LengthOrHashCode));
+    if (Res != 0)
+      return Res;
+    if (LengthOrHashCode == Other.LengthOrHashCode)
+      return 0;
+    return LengthOrHashCode < Other.LengthOrHashCode ? -1 : 1;
+  }
+
+  /// Convert to a string, usually for output purpose. Use caution on return
+  /// value's lifetime when converting to StringRef.
+  std::string str() const {
+    if (Data)
+      return std::string(Data, LengthOrHashCode);
+    if (LengthOrHashCode != 0)
+      return std::to_string(LengthOrHashCode);
+    return std::string();
+  }
+
+  /// Convert to StringRef. This is only allowed when it is known this object is
+  /// representing a StringRef, not a hash code. Calling this function on a hash
+  /// code is considered an error.
+  StringRef stringRef() const {
+    if (Data)
+      return StringRef(Data, LengthOrHashCode);
+    assert(LengthOrHashCode == 0 &&
+           "Cannot convert MD5 FunctionId to StringRef");
+    return StringRef();
+  }
+
+  friend raw_ostream &operator<<(raw_ostream &OS, const FunctionId &Obj);
+
+  /// Get hash code of this object. Returns this object's hash code if it is
+  /// already representing one, otherwise returns the MD5 of its string content.
+  /// Note that it is not the same as std::hash because we want to keep the
+  /// consistency that the same sample profile function in string form or MD5
+  /// form has the same hash code.
+  uint64_t getHashCode() const {
+    if (Data)
+      return MD5Hash(StringRef(Data, LengthOrHashCode));
+    return LengthOrHashCode;
+  }
+
+  bool empty() const { return LengthOrHashCode == 0; }
+
+  /// Check if this object represents a StringRef, or a hash code.
+  bool isStringRef() const { return Data != nullptr; }
+};
+
+inline bool operator==(const FunctionId &LHS, const FunctionId &RHS) {
+  return LHS.equals(RHS);
+}
+
+inline bool operator!=(const FunctionId &LHS, const FunctionId &RHS) {
+  return !LHS.equals(RHS);
+}
+
+inline bool operator<(const FunctionId &LHS, const FunctionId &RHS) {
+  return LHS.compare(RHS) < 0;
+}
+
+inline bool operator<=(const FunctionId &LHS, const FunctionId &RHS) {
+  return LHS.compare(RHS) <= 0;
+}
+
+inline bool operator>(const FunctionId &LHS, const FunctionId &RHS) {
+  return LHS.compare(RHS) > 0;
+}
+
+inline bool operator>=(const FunctionId &LHS, const FunctionId &RHS) {
+  return LHS.compare(RHS) >= 0;
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS, const FunctionId &Obj) {
+  if (Obj.Data)
+    return OS << StringRef(Obj.Data, Obj.LengthOrHashCode);
+  if (Obj.LengthOrHashCode != 0)
+    return OS << Obj.LengthOrHashCode;
+  return OS;
+}
+
+inline uint64_t MD5Hash(const FunctionId &Obj) {
+  return Obj.getHashCode();
+}
+
+inline uint64_t hash_value(const FunctionId &Obj) {
+  return Obj.getHashCode();
+}
+
+} // end namespace sampleprof
+
+/// Template specialization for FunctionId so that it can be used in LLVM map
+/// containers.
+template <> struct DenseMapInfo<sampleprof::FunctionId, void> {
+
+  static inline sampleprof::FunctionId getEmptyKey() {
+    return sampleprof::FunctionId(~0ULL);
+  }
+
+  static inline sampleprof::FunctionId getTombstoneKey() {
+    return sampleprof::FunctionId(~1ULL);
+  }
+
+  static unsigned getHashValue(const sampleprof::FunctionId &Val) {
+    return Val.getHashCode();
+  }
+
+  static bool isEqual(const sampleprof::FunctionId &LHS,
+                      const sampleprof::FunctionId &RHS) {
+    return LHS == RHS;
+  }
+};
+
+} // end namespace llvm
+
+namespace std {
+
+/// Template specialization for FunctionId so that it can be used in STL
+/// containers.
+template <> struct hash<llvm::sampleprof::FunctionId> {
+  size_t operator()(const llvm::sampleprof::FunctionId &Val) const {
+    return Val.getHashCode();
+  }
+};
+
+} // end namespace std
+
+#endif // LLVM_PROFILEDATA_FUNCTIONID_H
diff --git a/llvm/include/llvm/ProfileData/HashKeyMap.h b/llvm/include/llvm/ProfileData/HashKeyMap.h
@@ -0,0 +1,129 @@
+//===--- HashKeyMap.h - Wrapper for maps using hash value key ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+///
+/// Defines HashKeyMap template.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PROFILEDATA_HASHKEYMAP_H
+#define LLVM_PROFILEDATA_HASHKEYMAP_H
+
+#include "llvm/ADT/Hashing.h"
+#include <iterator>
+#include <utility>
+
+namespace llvm {
+
+namespace sampleprof {
+
+/// This class is a wrapper to associative container MapT<KeyT, ValueT> using
+/// the hash value of the original key as the new key. This greatly improves the
+/// performance of insert and query operations especially when hash values of
+/// keys are available a priori, and reduces memory usage if KeyT has a large
+/// size.
+/// All keys with the same hash value are considered equivalent (i.e. hash
+/// collision is silently ignored). Given such feature this class should only be
+/// used where it does not affect compilation correctness, for example, when
+/// loading a sample profile. The original key is not stored, so if the user
+/// needs to preserve it, it should be stored in the mapped type.
+/// Assuming the hashing algorithm is uniform, we use the formula
+/// 1 - Permute(n, k) / n ^ k where n is the universe size and k is number of
+/// elements chosen at random to calculate the probability of collision. With
+/// 1,000,000 entries the probability is negligible:
+/// 1 - (2^64)!/((2^64-1000000)!*(2^64)^1000000) ~= 3*10^-8.
+/// Source: https://en.wikipedia.org/wiki/Birthday_problem
+///
+/// \param MapT The underlying associative container type.
+/// \param KeyT The original key type, which requires the implementation of
+///   llvm::hash_value(KeyT).
+/// \param ValueT The original mapped type, which has the same requirement as
+///   the underlying container.
+/// \param MapTArgs Additional template parameters passed to the underlying
+///   container.
+template <template <typename, typename, typename...> typename MapT,
+          typename KeyT, typename ValueT, typename... MapTArgs>
+class HashKeyMap :
+    public MapT<decltype(hash_value(KeyT())), ValueT, MapTArgs...> {
+public:
+  using base_type = MapT<decltype(hash_value(KeyT())), ValueT, MapTArgs...>;
+  using key_type = decltype(hash_value(KeyT()));
+  using original_key_type = KeyT;
+  using mapped_type = ValueT;
+  using value_type = typename base_type::value_type;
+
+  using iterator = typename base_type::iterator;
+  using const_iterator = typename base_type::const_iterator;
+
+  template <typename... Ts>
+  std::pair<iterator, bool> try_emplace(const key_type &Hash,
+                                        const original_key_type &Key,
+                                        Ts &&...Args) {
+    assert(Hash == hash_value(Key));
+    return base_type::try_emplace(Hash, std::forward<Ts>(Args)...);
+  }
+
+  template <typename... Ts>
+  std::pair<iterator, bool> try_emplace(const original_key_type &Key,
+                                        Ts &&...Args) {
+    return try_emplace(hash_value(Key), Key, std::forward<Ts>(Args)...);
+  }
+
+  template <typename... Ts> std::pair<iterator, bool> emplace(Ts &&...Args) {
+    return try_emplace(std::forward<Ts>(Args)...);
+  }
+
+  mapped_type &operator[](const original_key_type &Key) {
+    return try_emplace(Key, mapped_type()).first->second;
+  }
+
+  iterator find(const original_key_type &Key) {
+    auto It = base_type::find(hash_value(Key));
+    if (It != base_type::end())
+      return It;
+    return base_type::end();
+  }
+
+  const_iterator find(const original_key_type &Key) const {
+    auto It = base_type::find(hash_value(Key));
+    if (It != base_type::end())
+      return It;
+    return base_type::end();
+  }
+
+  mapped_type lookup(const original_key_type &Key) const {
+    auto It = base_type::find(hash_value(Key));
+    if (It != base_type::end())
+      return It->second;
+    return mapped_type();
+  }
+
+  size_t count(const original_key_type &Key) const {
+    return base_type::count(hash_value(Key));
+  }
+
+  size_t erase(const original_key_type &Ctx) {
+    auto It = find(Ctx);
+    if (It != base_type::end()) {
+      base_type::erase(It);
+      return 1;
+    }
+    return 0;
+  }
+
+  iterator erase(const_iterator It) {
+    return base_type::erase(It);
+  }
+};
+
+}
+
+}
+
+#endif // LLVM_PROFILEDATA_HASHKEYMAP_H