diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 741c95f3a7d02a..66a6f5c0bac72d 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -831,6 +831,13 @@ option (LLVM_ENABLE_SPHINX "Use Sphinx to generate llvm documentation." OFF) option (LLVM_ENABLE_OCAMLDOC "Build OCaml bindings documentation." ON) option (LLVM_ENABLE_BINDINGS "Build bindings." ON) +if(UNIX AND CMAKE_SIZEOF_VOID_P GREATER_EQUAL 8) + set(LLVM_ENABLE_ONDISK_CAS_default ON) +else() + set(LLVM_ENABLE_ONDISK_CAS_default OFF) +endif() +option(LLVM_ENABLE_ONDISK_CAS "Build OnDiskCAS." ${LLVM_ENABLE_ONDISK_CAS_default}) + set(LLVM_INSTALL_DOXYGEN_HTML_DIR "${CMAKE_INSTALL_DOCDIR}/llvm/doxygen-html" CACHE STRING "Doxygen-generated HTML documentation install directory") set(LLVM_INSTALL_OCAMLDOC_HTML_DIR "${CMAKE_INSTALL_DOCDIR}/llvm/ocaml-html" diff --git a/llvm/docs/ContentAddressableStorage.md b/llvm/docs/ContentAddressableStorage.md new file mode 100644 index 00000000000000..4f2d9a6a3a9185 --- /dev/null +++ b/llvm/docs/ContentAddressableStorage.md @@ -0,0 +1,120 @@ +# Content Addressable Storage + +## Introduction to CAS + +Content Addressable Storage, or `CAS`, is a storage system where it assigns +unique addresses to the data stored. It is very useful for data deduplication +and creating unique identifiers. + +Unlike other kinds of storage systems, such as file systems, CAS is immutable. It +is more reliable to model a computation when representing the inputs and outputs +of the computation using objects stored in CAS. + +The basic unit of the CAS library is a CASObject, where it contains: + +* Data: arbitrary data +* References: references to other CASObject + +It can be conceptually modeled as something like: + +``` +struct CASObject { + ArrayRef<char> Data; + ArrayRef<CASObject*> Refs; +} +``` + +Such abstraction can allow simple composition of CASObjects into a DAG to +represent complicated data structure while still allowing data deduplication. 
+Note you can compare two DAGs by just comparing the CASObject hash of two +root nodes. + + + +## LLVM CAS Library User Guide + +The CAS-like storage provided in LLVM is `llvm::cas::ObjectStore`. +To reference a CASObject, there are a few different abstractions provided +with different trade-offs: + +### ObjectRef + +`ObjectRef` is a lightweight reference to a CASObject stored in the CAS. +This is the most commonly used abstraction and it is cheap to copy/pass +along. It has the following properties: + +* `ObjectRef` is only meaningful within the `ObjectStore` that created the ref. +`ObjectRef` created by different `ObjectStore` cannot be cross-referenced or +compared. +* `ObjectRef` doesn't guarantee the existence of the CASObject it points to. An +explicit load is required before accessing the data stored in CASObject. +This load can also fail, for reasons like but not limited to: object does +not exist, corrupted CAS storage, operation timeout, etc. +* If two `ObjectRef` are equal, it is guaranteed that the objects they point to +(if they exist) are identical. If they are not equal, the underlying objects are +guaranteed to be not the same. + +### ObjectProxy + +`ObjectProxy` represents a loaded CASObject. With an `ObjectProxy`, the +underlying stored data and references can be accessed without the need +of error handling. The class APIs also provide convenient methods to +access underlying data. The lifetime of the underlying data is equal to +the lifetime of the instance of `ObjectStore` unless explicitly copied. + +### CASID + +`CASID` is the hash identifier for CASObjects. It owns the underlying +storage for hash value so it can be expensive to copy and compare depending +on the hash algorithm. `CASID` is generally only useful in rare situations +like printing raw hash value or exchanging hash values between different +CAS instances with the same hashing schema. + +### ObjectStore + +`ObjectStore` is the CAS-like object storage. 
It provides APIs to save +and load CASObjects, for example: + +``` +ObjectRef A, B, C; +Expected<ObjectRef> Stored = ObjectStore.store("data", {A, B}); +Expected<ObjectProxy> Loaded = ObjectStore.getProxy(C); +``` + +It also provides APIs to convert between `ObjectRef`, `ObjectProxy` and +`CASID`. + + + +## CAS Library Implementation Guide + +The LLVM ObjectStore APIs are designed so that it is easy to add +customized CAS implementations that are interchangeable with builtin +CAS implementations. + +To add your own implementation, you just need to add a subclass to +`llvm::cas::ObjectStore` and implement all its pure virtual methods. +To be interchangeable with LLVM ObjectStore, the new CAS implementation +needs to conform to the following contracts: + +* Different CASObject stored in the ObjectStore needs to have a different hash +and result in a different `ObjectRef`. Vice versa, same CASObject should have +same hash and same `ObjectRef`. Note two different CASObjects with identical +data but different references are considered different objects. +* `ObjectRef`s are comparable within the same `ObjectStore` instance, and can +be used to determine the equality of the underlying CASObjects. +* The loaded objects from the ObjectStore need to have the lifetime to be at +least as long as the ObjectStore itself. + +If not specified, the behavior can be implementation defined. For example, +`ObjectRef` can be used to point to a loaded CASObject so +`ObjectStore` never fails to load. It is also legal to use a stricter model +than required. For example, an `ObjectRef` that can be used to compare +objects between different `ObjectStore` instances is legal, but users +of the ObjectStore should not depend on this behavior. + +For CAS library implementers, there is also an `ObjectHandle` class that +is an internal representation of a loaded CASObject reference. 
+`ObjectProxy` is just a pair of `ObjectHandle` and `ObjectStore`, because +just like `ObjectRef`, `ObjectHandle` is only useful when paired with +the ObjectStore that knows about the loaded CASObject. diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst index df61628b06c7db..da1afb360ed6de 100644 --- a/llvm/docs/Reference.rst +++ b/llvm/docs/Reference.rst @@ -15,6 +15,7 @@ LLVM and API reference documentation. BranchWeightMetadata Bugpoint CommandGuide/index + ContentAddressableStorage ConvergenceAndUniformity ConvergentOperations Coroutines @@ -232,3 +233,6 @@ Additional Topics :doc:`ConvergenceAndUniformity` A description of uniformity analysis in the presence of irreducible control flow, and its implementation. + +:doc:`ContentAddressableStorage` + A reference guide for using LLVM's CAS library. diff --git a/llvm/include/llvm-c/CAS/PluginAPI_functions.h b/llvm/include/llvm-c/CAS/PluginAPI_functions.h new file mode 100644 index 00000000000000..8a7c9a64597ca1 --- /dev/null +++ b/llvm/include/llvm-c/CAS/PluginAPI_functions.h @@ -0,0 +1,307 @@ +/*===-- llvm-c/CAS/PluginAPI_functions.h - CAS Plugin Functions Interface -===*\ +|* *| +|* Part of the LLVM Project, under the Apache License v2.0 with LLVM *| +|* Exceptions. *| +|* See https://llvm.org/LICENSE.txt for license information. *| +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* The functions for the LLVM CAS plugin API. Intended for assisting *| +|* implementations of the API. *| +|* The API is experimental and subject to change. 
*| +|* *| +\*===----------------------------------------------------------------------===*/ + +#ifndef LLVM_C_CAS_PLUGINAPI_FUNCTIONS_H +#define LLVM_C_CAS_PLUGINAPI_FUNCTIONS_H + +#include "llvm-c/CAS/PluginAPI_types.h" +#include "llvm-c/ExternC.h" + +#ifndef LLCAS_PUBLIC +#ifdef _WIN32 +#define LLCAS_PUBLIC __declspec(dllexport) +#else +#define LLCAS_PUBLIC +#endif +#endif + +LLVM_C_EXTERN_C_BEGIN + +/** + * Returns the \c LLCAS_VERSION_MAJOR and \c LLCAS_VERSION_MINOR values that the + * plugin was compiled with. + * Intended for assisting compatibility with different versions. + */ +LLCAS_PUBLIC void llcas_get_plugin_version(unsigned *major, unsigned *minor); + +/** + * Releases memory of C string pointers provided by other functions. + */ +LLCAS_PUBLIC void llcas_string_dispose(char *); + +/** + * Options object to configure creation of \c llcas_cas_t. After passing to + * \c llcas_cas_create, its memory can be released via + * \c llcas_cas_options_dispose. + */ +LLCAS_PUBLIC llcas_cas_options_t llcas_cas_options_create(void); + +LLCAS_PUBLIC void llcas_cas_options_dispose(llcas_cas_options_t); + +/** + * Receives the \c LLCAS_VERSION_MAJOR and \c LLCAS_VERSION_MINOR values that + * the client was compiled with. + * Intended for assisting compatibility with different versions. + */ +LLCAS_PUBLIC void llcas_cas_options_set_client_version(llcas_cas_options_t, + unsigned major, + unsigned minor); + +/** + * Receives a local file-system path that the plugin should use for any on-disk + * resources/caches. + */ +LLCAS_PUBLIC void llcas_cas_options_set_ondisk_path(llcas_cas_options_t, + const char *path); + +/** + * Receives a name/value strings pair, for the plugin to set as a custom option + * it supports. These are usually passed through as invocation options and are + * opaque to the client. + * + * \param error optional pointer to receive an error message if an error + * occurred. 
If set, the memory it points to needs to be released via + * \c llcas_string_dispose. + * \returns true if there was an error, false otherwise. + */ +LLCAS_PUBLIC bool llcas_cas_options_set_option(llcas_cas_options_t, + const char *name, + const char *value, char **error); + +/** + * Creates a new \c llcas_cas_t object. The objects returned from the other + * functions are only valid to use while the \c llcas_cas_t object that they + * came from is still valid. + * + * \param error optional pointer to receive an error message if an error + * occurred. If set, the memory it points to needs to be released via + * \c llcas_string_dispose. + * \returns \c NULL if there was an error. + */ +LLCAS_PUBLIC llcas_cas_t llcas_cas_create(llcas_cas_options_t, char **error); + +/** + * Releases memory of \c llcas_cas_t. After calling this it is invalid to keep + * using objects that originated from this \c llcas_cas_t instance. + */ +LLCAS_PUBLIC void llcas_cas_dispose(llcas_cas_t); + +/** + * \returns the hash schema name that the plugin is using. The string memory it + * points to needs to be released via \c llcas_string_dispose. + */ +LLCAS_PUBLIC char *llcas_cas_get_hash_schema_name(llcas_cas_t); + +/** + * Parses the printed digest and returns the digest hash bytes. + * + * \param printed_digest a C string that was previously provided by + * \c llcas_digest_print. + * \param bytes pointer to a buffer for writing the digest bytes. Can be \c NULL + * if \p bytes_size is 0. + * \param bytes_size the size of the buffer. + * \param error optional pointer to receive an error message if an error + * occurred. If set, the memory it points to needs to be released via + * \c llcas_string_dispose. + * \returns 0 if there was an error. If \p bytes_size is smaller than the + * required size to fit the digest bytes, returns the required buffer size + * without writing to \c bytes. Otherwise writes the digest bytes to \p bytes + * and returns the number of written bytes. 
+ */ +LLCAS_PUBLIC unsigned llcas_digest_parse(llcas_cas_t, + const char *printed_digest, + uint8_t *bytes, size_t bytes_size, + char **error); + +/** + * Returns a string for the given digest bytes that can be passed to + * \c llcas_digest_parse. + * + * \param printed_id pointer to receive the printed digest string. The memory it + * points to needs to be released via \c llcas_string_dispose. + * \param error optional pointer to receive an error message if an error + * occurred. If set, the memory it points to needs to be released via + * \c llcas_string_dispose. + * \returns true if there was an error, false otherwise. + */ +LLCAS_PUBLIC bool llcas_digest_print(llcas_cas_t, llcas_digest_t, + char **printed_id, char **error); + +/** + * Provides the \c llcas_objectid_t value for the given \c llcas_digest_t. + * + * \param digest the digest bytes that the returned \c llcas_objectid_t + * represents. + * \param p_id pointer to store the returned \c llcas_objectid_t object. + * \param error optional pointer to receive an error message if an error + * occurred. If set, the memory it points to needs to be released via + * \c llcas_string_dispose. + * \returns true if there was an error, false otherwise. + */ +LLCAS_PUBLIC bool llcas_cas_get_objectid(llcas_cas_t, llcas_digest_t digest, + llcas_objectid_t *p_id, char **error); + +/** + * \returns the \c llcas_digest_t value for the given \c llcas_objectid_t. + * The memory that the buffer points to is valid for the lifetime of the + * \c llcas_cas_t object. + */ +LLCAS_PUBLIC llcas_digest_t llcas_objectid_get_digest(llcas_cas_t, + llcas_objectid_t); + +/** + * Checks whether a \c llcas_objectid_t points to an existing object. + * + * \param globally For CAS implementations that distinguish between local CAS + * and remote/distributed CAS, \p globally set to false indicates that the + * lookup will be restricted to the local CAS, returning "not found" even if the + * object might exist in the remote CAS. 
+ * \param error optional pointer to receive an error message if an error + * occurred. If set, the memory it points to needs to be released via + * \c llcas_string_dispose. + * \returns one of \c llcas_lookup_result_t. + */ +LLCAS_PUBLIC llcas_lookup_result_t llcas_cas_contains_object(llcas_cas_t, + llcas_objectid_t, + bool globally, + char **error); + +/** + * Loads the object that \c llcas_objectid_t points to. + * + * \param error optional pointer to receive an error message if an error + * occurred. If set, the memory it points to needs to be released via + * \c llcas_string_dispose. + * \returns one of \c llcas_lookup_result_t. + */ +LLCAS_PUBLIC llcas_lookup_result_t llcas_cas_load_object( + llcas_cas_t, llcas_objectid_t, llcas_loaded_object_t *, char **error); + +/** + * Like \c llcas_cas_load_object but loading happens via a callback function. + * Whether the call is asynchronous or not depends on the implementation. + * + * \param ctx_cb pointer to pass to the callback function. + */ +LLCAS_PUBLIC void llcas_cas_load_object_async(llcas_cas_t, llcas_objectid_t, + void *ctx_cb, + llcas_cas_load_object_cb); + +/** + * Stores the object with the provided data buffer and \c llcas_objectid_t + * references, and provides its associated \c llcas_objectid_t. + * + * \param refs pointer to array of \c llcas_objectid_t. Can be \c NULL if + * \p refs_count is 0. + * \param refs_count number of \c llcas_objectid_t objects in the array. + * \param p_id pointer to store the returned \c llcas_objectid_t object. + * \param error optional pointer to receive an error message if an error + * occurred. If set, the memory it points to needs to be released via + * \c llcas_string_dispose. + * \returns true if there was an error, false otherwise. 
+ */ +LLCAS_PUBLIC bool llcas_cas_store_object(llcas_cas_t, llcas_data_t, + const llcas_objectid_t *refs, + size_t refs_count, + llcas_objectid_t *p_id, char **error); + +/** + * \returns the data buffer of the provided \c llcas_loaded_object_t. The buffer + * pointer must be 8-byte aligned and \c NULL terminated. The memory that the + * buffer points to is valid for the lifetime of the \c llcas_cas_t object. + */ +LLCAS_PUBLIC llcas_data_t llcas_loaded_object_get_data(llcas_cas_t, + llcas_loaded_object_t); + +/** + * \returns the references of the provided \c llcas_loaded_object_t. + */ +LLCAS_PUBLIC llcas_object_refs_t + llcas_loaded_object_get_refs(llcas_cas_t, llcas_loaded_object_t); + +/** + * \returns the number of references in the provided \c llcas_object_refs_t. + */ +LLCAS_PUBLIC size_t llcas_object_refs_get_count(llcas_cas_t, + llcas_object_refs_t); + +/** + * \returns the \c llcas_objectid_t of the reference at \p index. It is invalid + * to pass an index that is out of the range of references. + */ +LLCAS_PUBLIC llcas_objectid_t llcas_object_refs_get_id(llcas_cas_t, + llcas_object_refs_t, + size_t index); + +/** + * Retrieves the \c llcas_objectid_t value associated with a \p key. + * + * \param p_value pointer to store the returned \c llcas_objectid_t object. + * \param globally if true it is a hint to the underlying implementation that + * the lookup is profitable to be done on a distributed caching level, not just + * locally. The implementation is free to ignore this flag. + * \param error optional pointer to receive an error message if an error + * occurred. If set, the memory it points to needs to be released via + * \c llcas_string_dispose. + * \returns one of \c llcas_lookup_result_t. 
+ */ +LLCAS_PUBLIC llcas_lookup_result_t llcas_actioncache_get_for_digest( + llcas_cas_t, llcas_digest_t key, llcas_objectid_t *p_value, bool globally, + char **error); + +/** + * Like \c llcas_actioncache_get_for_digest but result is provided to a callback + * function. Whether the call is asynchronous or not depends on the + * implementation. + * + * \param ctx_cb pointer to pass to the callback function. + */ +LLCAS_PUBLIC void +llcas_actioncache_get_for_digest_async(llcas_cas_t, llcas_digest_t key, + bool globally, void *ctx_cb, + llcas_actioncache_get_cb); + +/** + * Associates a \c llcas_objectid_t \p value with a \p key. It is invalid to set + * a different \p value to the same \p key. + * + * \param globally if true it is a hint to the underlying implementation that + * the association is profitable to be done on a distributed caching level, not + * just locally. The implementation is free to ignore this flag. + * \param error optional pointer to receive an error message if an error + * occurred. If set, the memory it points to needs to be released via + * \c llcas_string_dispose. + * \returns true if there was an error, false otherwise. + */ +LLCAS_PUBLIC bool llcas_actioncache_put_for_digest(llcas_cas_t, + llcas_digest_t key, + llcas_objectid_t value, + bool globally, char **error); + +/** + * Like \c llcas_actioncache_put_for_digest but result is provided to a callback + * function. Whether the call is asynchronous or not depends on the + * implementation. + * + * \param ctx_cb pointer to pass to the callback function. 
+ */ +LLCAS_PUBLIC void +llcas_actioncache_put_for_digest_async(llcas_cas_t, llcas_digest_t key, + llcas_objectid_t value, bool globally, + void *ctx_cb, llcas_actioncache_put_cb); + +LLVM_C_EXTERN_C_END + +#endif /* LLVM_C_CAS_PLUGINAPI_FUNCTIONS_H */ diff --git a/llvm/include/llvm-c/CAS/PluginAPI_types.h b/llvm/include/llvm-c/CAS/PluginAPI_types.h new file mode 100644 index 00000000000000..fdade74fcebcc6 --- /dev/null +++ b/llvm/include/llvm-c/CAS/PluginAPI_types.h @@ -0,0 +1,118 @@ +/*===-- llvm-c/CAS/PluginAPI_Types.h - CAS Plugin Types Interface -*- C -*-===*\ +|* *| +|* Part of the LLVM Project, under the Apache License v2.0 with LLVM *| +|* Exceptions. *| +|* See https://llvm.org/LICENSE.txt for license information. *| +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* The types for the LLVM CAS plugin API. *| +|* The API is experimental and subject to change. *| +|* *| +\*===----------------------------------------------------------------------===*/ + +#ifndef LLVM_C_CAS_PLUGINAPI_TYPES_H +#define LLVM_C_CAS_PLUGINAPI_TYPES_H + +#include +#include +#include + +#define LLCAS_VERSION_MAJOR 0 +#define LLCAS_VERSION_MINOR 1 + +typedef struct llcas_cas_options_s *llcas_cas_options_t; +typedef struct llcas_cas_s *llcas_cas_t; + +/** + * Digest hash bytes. + */ +typedef struct { + const uint8_t *data; + size_t size; +} llcas_digest_t; + +/** + * Data buffer for stored CAS objects. + */ +typedef struct { + const void *data; + size_t size; +} llcas_data_t; + +/** + * Identifier for a CAS object. + */ +typedef struct { + uint64_t opaque; +} llcas_objectid_t; + +/** + * A loaded CAS object. + */ +typedef struct { + uint64_t opaque; +} llcas_loaded_object_t; + +/** + * Object references for a CAS object. + */ +typedef struct { + uint64_t opaque_b; + uint64_t opaque_e; +} llcas_object_refs_t; + +/** + * Return values for a load operation. 
+ */ +typedef enum { + /** + * The object was found. + */ + LLCAS_LOOKUP_RESULT_SUCCESS = 0, + + /** + * The object was not found. + */ + LLCAS_LOOKUP_RESULT_NOTFOUND = 1, + + /** + * An error occurred. + */ + LLCAS_LOOKUP_RESULT_ERROR = 2, +} llcas_lookup_result_t; + +/** + * Callback for \c llcas_cas_load_object_async. + * + * \param ctx pointer passed through from the \c llcas_cas_load_object_async + * call. + * \param error message if an error occurred. If set, the memory it points to + * needs to be released via \c llcas_string_dispose. + */ +typedef void (*llcas_cas_load_object_cb)(void *ctx, llcas_lookup_result_t, + llcas_loaded_object_t, char *error); + +/** + * Callback for \c llcas_actioncache_get_for_digest_async. + * + * \param ctx pointer passed through from the + * \c llcas_actioncache_get_for_digest_async call. + * \param error message if an error occurred. If set, the memory it points to + * needs to be released via \c llcas_string_dispose. + */ +typedef void (*llcas_actioncache_get_cb)(void *ctx, llcas_lookup_result_t, + llcas_objectid_t, char *error); + +/** + * Callback for \c llcas_actioncache_put_for_digest_async. + * + * \param ctx pointer passed through from the + * \c llcas_actioncache_put_for_digest_async call. + * \param error message if an error occurred. If set, the memory it points to + * needs to be released via \c llcas_string_dispose. + */ +typedef void (*llcas_actioncache_put_cb)(void *ctx, bool failed, char *error); + +#endif /* LLVM_C_CAS_PLUGINAPI_TYPES_H */ diff --git a/llvm/include/llvm/ADT/TrieRawHashMap.h b/llvm/include/llvm/ADT/TrieRawHashMap.h new file mode 100644 index 00000000000000..baa08e214ce6fd --- /dev/null +++ b/llvm/include/llvm/ADT/TrieRawHashMap.h @@ -0,0 +1,398 @@ +//===- TrieRawHashMap.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ADT_TRIERAWHASHMAP_H +#define LLVM_ADT_TRIERAWHASHMAP_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include +#include + +namespace llvm { + +class raw_ostream; + +/// TrieRawHashMap - is a lock-free thread-safe trie that can be used to +/// store/index data based on a hash value. It can be customized to work with +/// any hash algorithm or store any data. +/// +/// Data structure: +/// Data node stored in the Trie contains both hash and data: +/// struct { +/// HashT Hash; +/// DataT Data; +/// }; +/// +/// Data is stored/indexed via a prefix tree, where each node in the tree can be +/// either the root, a sub-trie or a data node. Assuming a 4-bit hash and two +/// data objects {0001, A} and {0100, B}, it can be stored in a trie +/// (assuming Root has 2 bits, SubTrie has 1 bit): +/// +--------+ +/// |Root[00]| -> {0001, A} +/// | [01]| -> {0100, B} +/// | [10]| (empty) +/// | [11]| (empty) +/// +--------+ +/// +/// Inserting a new object {0010, C} will result in: +/// +--------+ +----------+ +/// |Root[00]| -> |SubTrie[0]| -> {0001, A} +/// | | | [1]| -> {0010, C} +/// | | +----------+ +/// | [01]| -> {0100, B} +/// | [10]| (empty) +/// | [11]| (empty) +/// +--------+ +/// Note object A is sunk down to a sub-trie during the insertion. All the +/// nodes are inserted through compare-exchange to ensure thread-safe and +/// lock-free insertion. +/// +/// To find an object in the trie, walk the tree with prefix of the hash until +/// the data node is found. Then the hash is compared with the hash stored in +/// the data node to see if it is the same object. +/// +/// Hash collision is not allowed so it is recommended to use trie with a +/// "strong" hashing algorithm. A well-distributed hash can also result in +/// better performance and memory usage. 
+/// +/// It currently does not support iteration and deletion. + +/// Base class for a lock-free thread-safe hash-mapped trie. +class ThreadSafeTrieRawHashMapBase { +public: + static constexpr size_t TrieContentBaseSize = 4; + static constexpr size_t DefaultNumRootBits = 6; + static constexpr size_t DefaultNumSubtrieBits = 4; + +private: + template struct AllocValueType { + char Base[TrieContentBaseSize]; + std::aligned_union_t Content; + }; + +protected: + template + static constexpr size_t DefaultContentAllocSize = sizeof(AllocValueType); + + template + static constexpr size_t DefaultContentAllocAlign = alignof(AllocValueType); + + template + static constexpr size_t DefaultContentOffset = + offsetof(AllocValueType, Content); + +public: + void operator delete(void *Ptr) { ::free(Ptr); } + + LLVM_DUMP_METHOD void dump() const; + void print(raw_ostream &OS) const; + +protected: + /// Result of a lookup. Suitable for an insertion hint. Maybe could be + /// expanded into an iterator of sorts, but likely not useful (visiting + /// everything in the trie should probably be done some way other than + /// through an iterator pattern). + class PointerBase { + protected: + void *get() const { return I == -2u ? P : nullptr; } + + public: + PointerBase() noexcept = default; + PointerBase(PointerBase &&) = default; + PointerBase(const PointerBase &) = default; + PointerBase &operator=(PointerBase &&) = default; + PointerBase &operator=(const PointerBase &) = default; + + private: + friend class ThreadSafeTrieRawHashMapBase; + explicit PointerBase(void *Content) : P(Content), I(-2u) {} + PointerBase(void *P, unsigned I, unsigned B) : P(P), I(I), B(B) {} + + bool isHint() const { return I != -1u && I != -2u; } + + void *P = nullptr; + unsigned I = -1u; + unsigned B = 0; + }; + + /// Find the stored content with hash. + PointerBase find(ArrayRef Hash) const; + + /// Insert and return the stored content. 
+ PointerBase + insert(PointerBase Hint, ArrayRef Hash, + function_ref Hash)> + Constructor); + + ThreadSafeTrieRawHashMapBase() = delete; + + ThreadSafeTrieRawHashMapBase( + size_t ContentAllocSize, size_t ContentAllocAlign, size_t ContentOffset, + std::optional NumRootBits = std::nullopt, + std::optional NumSubtrieBits = std::nullopt); + + /// Destructor, which asserts if there's anything to do. Subclasses should + /// call \a destroyImpl(). + /// + /// \pre \a destroyImpl() was already called. + ~ThreadSafeTrieRawHashMapBase(); + void destroyImpl(function_ref Destructor); + + ThreadSafeTrieRawHashMapBase(ThreadSafeTrieRawHashMapBase &&RHS); + + // Move assignment can be implemented in a thread-safe way if NumRootBits and + // NumSubtrieBits are stored inside the Root. + ThreadSafeTrieRawHashMapBase & + operator=(ThreadSafeTrieRawHashMapBase &&RHS) = delete; + + // No copy. + ThreadSafeTrieRawHashMapBase(const ThreadSafeTrieRawHashMapBase &) = delete; + ThreadSafeTrieRawHashMapBase & + operator=(const ThreadSafeTrieRawHashMapBase &) = delete; + + // Debug functions. Implementation details and not guaranteed to be + // thread-safe. + PointerBase getRoot() const; + unsigned getStartBit(PointerBase P) const; + unsigned getNumBits(PointerBase P) const; + unsigned getNumSlotUsed(PointerBase P) const; + std::string getTriePrefixAsString(PointerBase P) const; + unsigned getNumTries() const; + // Visit next trie in the allocation chain. + PointerBase getNextTrie(PointerBase P) const; + +private: + friend class TrieRawHashMapTestHelper; + const unsigned short ContentAllocSize; + const unsigned short ContentAllocAlign; + const unsigned short ContentOffset; + unsigned short NumRootBits; + unsigned short NumSubtrieBits; + struct ImplType; + // ImplPtr is owned by ThreadSafeTrieRawHashMapBase and needs to be freed in + // destroyImpl. + std::atomic ImplPtr; + ImplType &getOrCreateImpl(); + ImplType *getImpl() const; +}; + +/// Lock-free thread-safe hash-mapped trie. 
+template +class ThreadSafeTrieRawHashMap : public ThreadSafeTrieRawHashMapBase { +public: + using HashT = std::array; + + class LazyValueConstructor; + struct value_type { + const HashT Hash; + T Data; + + value_type(value_type &&) = default; + value_type(const value_type &) = default; + + value_type(ArrayRef Hash, const T &Data) + : Hash(makeHash(Hash)), Data(Data) {} + value_type(ArrayRef Hash, T &&Data) + : Hash(makeHash(Hash)), Data(std::move(Data)) {} + + private: + friend class LazyValueConstructor; + + struct EmplaceTag {}; + template + value_type(ArrayRef Hash, EmplaceTag, ArgsT &&...Args) + : Hash(makeHash(Hash)), Data(std::forward(Args)...) {} + + static HashT makeHash(ArrayRef HashRef) { + HashT Hash; + std::copy(HashRef.begin(), HashRef.end(), Hash.data()); + return Hash; + } + }; + + using ThreadSafeTrieRawHashMapBase::operator delete; + using HashType = HashT; + + using ThreadSafeTrieRawHashMapBase::dump; + using ThreadSafeTrieRawHashMapBase::print; + +private: + template class PointerImpl : PointerBase { + friend class ThreadSafeTrieRawHashMap; + + ValueT *get() const { + if (void *B = PointerBase::get()) + return reinterpret_cast(B); + return nullptr; + } + + public: + ValueT &operator*() const { + assert(get()); + return *get(); + } + ValueT *operator->() const { + assert(get()); + return get(); + } + explicit operator bool() const { return get(); } + + PointerImpl() = default; + PointerImpl(PointerImpl &&) = default; + PointerImpl(const PointerImpl &) = default; + PointerImpl &operator=(PointerImpl &&) = default; + PointerImpl &operator=(const PointerImpl &) = default; + + protected: + PointerImpl(PointerBase Result) : PointerBase(Result) {} + }; + +public: + class pointer; + class const_pointer; + class pointer : public PointerImpl { + friend class ThreadSafeTrieRawHashMap; + friend class const_pointer; + + public: + pointer() = default; + pointer(pointer &&) = default; + pointer(const pointer &) = default; + pointer &operator=(pointer &&) = 
default; + pointer &operator=(const pointer &) = default; + + private: + pointer(PointerBase Result) : pointer::PointerImpl(Result) {} + }; + + class const_pointer : public PointerImpl { + friend class ThreadSafeTrieRawHashMap; + + public: + const_pointer() = default; + const_pointer(const_pointer &&) = default; + const_pointer(const const_pointer &) = default; + const_pointer &operator=(const_pointer &&) = default; + const_pointer &operator=(const const_pointer &) = default; + + const_pointer(const pointer &P) : const_pointer::PointerImpl(P) {} + + private: + const_pointer(PointerBase Result) : const_pointer::PointerImpl(Result) {} + }; + + class LazyValueConstructor { + public: + value_type &operator()(T &&RHS) { + assert(Mem && "Constructor already called, or moved away"); + return assign(::new (Mem) value_type(Hash, std::move(RHS))); + } + value_type &operator()(const T &RHS) { + assert(Mem && "Constructor already called, or moved away"); + return assign(::new (Mem) value_type(Hash, RHS)); + } + template value_type &emplace(ArgsT &&...Args) { + assert(Mem && "Constructor already called, or moved away"); + return assign(::new (Mem) + value_type(Hash, typename value_type::EmplaceTag{}, + std::forward(Args)...)); + } + + LazyValueConstructor(LazyValueConstructor &&RHS) + : Mem(RHS.Mem), Result(RHS.Result), Hash(RHS.Hash) { + RHS.Mem = nullptr; // Moved away, cannot call. + } + ~LazyValueConstructor() { assert(!Mem && "Constructor never called!"); } + + private: + value_type &assign(value_type *V) { + Mem = nullptr; + Result = V; + return *V; + } + friend class ThreadSafeTrieRawHashMap; + LazyValueConstructor() = delete; + LazyValueConstructor(void *Mem, value_type *&Result, ArrayRef Hash) + : Mem(Mem), Result(Result), Hash(Hash) { + assert(Hash.size() == sizeof(HashT) && "Invalid hash"); + assert(Mem && "Invalid memory for construction"); + } + void *Mem; + value_type *&Result; + ArrayRef Hash; + }; + + /// Insert with a hint. 
Default-constructed hint will work, but it's + /// recommended to start with a lookup to avoid overhead in object creation + /// if it already exists. + pointer insertLazy(const_pointer Hint, ArrayRef Hash, + function_ref OnConstruct) { + return pointer(ThreadSafeTrieRawHashMapBase::insert( + Hint, Hash, [&](void *Mem, ArrayRef Hash) { + value_type *Result = nullptr; + OnConstruct(LazyValueConstructor(Mem, Result, Hash)); + return Result->Hash.data(); + })); + } + + pointer insertLazy(ArrayRef Hash, + function_ref OnConstruct) { + return insertLazy(const_pointer(), Hash, OnConstruct); + } + + pointer insert(const_pointer Hint, value_type &&HashedData) { + return insertLazy(Hint, HashedData.Hash, [&](LazyValueConstructor C) { + C(std::move(HashedData.Data)); + }); + } + + pointer insert(const_pointer Hint, const value_type &HashedData) { + return insertLazy(Hint, HashedData.Hash, + [&](LazyValueConstructor C) { C(HashedData.Data); }); + } + + pointer find(ArrayRef Hash) { + assert(Hash.size() == std::tuple_size::value); + return ThreadSafeTrieRawHashMapBase::find(Hash); + } + + const_pointer find(ArrayRef Hash) const { + assert(Hash.size() == std::tuple_size::value); + return ThreadSafeTrieRawHashMapBase::find(Hash); + } + + ThreadSafeTrieRawHashMap(std::optional NumRootBits = std::nullopt, + std::optional NumSubtrieBits = std::nullopt) + : ThreadSafeTrieRawHashMapBase(DefaultContentAllocSize, + DefaultContentAllocAlign, + DefaultContentOffset, + NumRootBits, NumSubtrieBits) {} + + ~ThreadSafeTrieRawHashMap() { + if constexpr (std::is_trivially_destructible::value) + this->destroyImpl(nullptr); + else + this->destroyImpl( + [](void *P) { static_cast(P)->~value_type(); }); + } + + // Move constructor okay. + ThreadSafeTrieRawHashMap(ThreadSafeTrieRawHashMap &&) = default; + + // No move assignment or any copy. 
+ ThreadSafeTrieRawHashMap &operator=(ThreadSafeTrieRawHashMap &&) = delete; + ThreadSafeTrieRawHashMap(const ThreadSafeTrieRawHashMap &) = delete; + ThreadSafeTrieRawHashMap & + operator=(const ThreadSafeTrieRawHashMap &) = delete; +}; + +} // namespace llvm + +#endif // LLVM_ADT_TRIERAWHASHMAP_H diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h new file mode 100644 index 00000000000000..9abefb1876265a --- /dev/null +++ b/llvm/include/llvm/CAS/ActionCache.h @@ -0,0 +1,154 @@ +//===- llvm/CAS/ActionCache.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_CASACTIONCACHE_H +#define LLVM_CAS_CASACTIONCACHE_H + +#include "llvm/ADT/FunctionExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/CASID.h" +#include "llvm/CAS/CASReference.h" +#include "llvm/Support/Error.h" +#include + +namespace llvm::cas { + +class ObjectStore; +class CASID; +class ObjectProxy; + +/// A key for caching an operation. +/// It is implemented as a bag of bytes and provides a convenient constructor +/// for CAS types. +class CacheKey { +public: + StringRef getKey() const { return Key; } + + // TODO: Support CacheKey other than a CASID but rather any array of bytes. + // To do that, ActionCache needs to be able to rehash the key into the index, + // so that a `getOrCompute` method can be used to avoid multiple calls to the + // hash function.
+ CacheKey(const CASID &ID); + CacheKey(const ObjectProxy &Proxy); + CacheKey(const ObjectStore &CAS, const ObjectRef &Ref); + +private: + std::string Key; +}; + +using AsyncCASIDValue = AsyncValue; + +/// This is used to workaround the issue of MSVC needing default-constructible +/// types for \c std::promise/future. +struct AsyncErrorValue { + Error take() { return std::move(Value); } + + AsyncErrorValue() : Value(Error::success()) {} + AsyncErrorValue(Error &&E) : Value(std::move(E)) {} + +private: + Error Value; +}; + +/// A cache from a key describing an action to the result of doing it. +/// +/// Actions are expected to be pure (collision is an error). +class ActionCache { + virtual void anchor(); + +public: + /// Get a previously computed result for \p ActionKey. + /// + /// \param Globally if true it is a hint to the underlying implementation that + /// the lookup is profitable to be done on a distributed caching level, not + /// just locally. The implementation is free to ignore this flag. + Expected> get(const CacheKey &ActionKey, + bool Globally = false) const { + return getImpl(arrayRefFromStringRef(ActionKey.getKey()), Globally); + } + + /// Cache \p Result for the \p ActionKey computation. + /// + /// \param Globally if true it is a hint to the underlying implementation that + /// the association is profitable to be done on a distributed caching level, + /// not just locally. The implementation is free to ignore this flag. + Error put(const CacheKey &ActionKey, const CASID &Result, + bool Globally = false) { + assert(Result.getContext().getHashSchemaIdentifier() == + getContext().getHashSchemaIdentifier() && + "Hash schema mismatch"); + return putImpl(arrayRefFromStringRef(ActionKey.getKey()), Result, Globally); + } + +#ifndef _MSC_VER + /// FIXME: MSVC doesn't compile Error within Promise/Future correctly and will + /// result in unchecked error. Disable AsyncAPIs when using MSVC for now. + + /// Asynchronous version of \c get. 
+ std::future getFuture(const CacheKey &ActionKey, + bool Globally = false) const; + + /// Asynchronous version of \c get. + void getAsync( + const CacheKey &ActionKey, bool Globally, + unique_function>)> Callback) const { + return getImplAsync(arrayRefFromStringRef(ActionKey.getKey()), Globally, + std::move(Callback)); + } + + /// Asynchronous version of \c put. + std::future putFuture(const CacheKey &ActionKey, + const CASID &Result, + bool Globally = false); + + /// Asynchronous version of \c put. + void putAsync(const CacheKey &ActionKey, const CASID &Result, bool Globally, + unique_function Callback) { + assert(Result.getContext().getHashSchemaIdentifier() == + getContext().getHashSchemaIdentifier() && + "Hash schema mismatch"); + return putImplAsync(arrayRefFromStringRef(ActionKey.getKey()), Result, + Globally, std::move(Callback)); + } +#endif + + virtual ~ActionCache() = default; + +protected: + virtual Expected> getImpl(ArrayRef ResolvedKey, + bool Globally) const = 0; + virtual void getImplAsync( + ArrayRef ResolvedKey, bool Globally, + unique_function>)> Callback) const; + + virtual Error putImpl(ArrayRef ResolvedKey, const CASID &Result, + bool Globally) = 0; + virtual void putImplAsync(ArrayRef ResolvedKey, const CASID &Result, + bool Globally, + unique_function Callback); + + ActionCache(const CASContext &Context) : Context(Context) {} + + const CASContext &getContext() const { return Context; } + +private: + const CASContext &Context; +}; + +/// Create an action cache in memory. +std::unique_ptr createInMemoryActionCache(); + +/// Get a reasonable default on-disk path for a persistent ActionCache for the +/// current user. +std::string getDefaultOnDiskActionCachePath(); + +/// Create an action cache on disk. 
+Expected> createOnDiskActionCache(StringRef Path); +} // end namespace llvm::cas + +#endif // LLVM_CAS_CASACTIONCACHE_H diff --git a/llvm/include/llvm/CAS/BuiltinCASContext.h b/llvm/include/llvm/CAS/BuiltinCASContext.h new file mode 100644 index 00000000000000..ebc4ca8bd1f2e9 --- /dev/null +++ b/llvm/include/llvm/CAS/BuiltinCASContext.h @@ -0,0 +1,88 @@ +//===- BuiltinCASContext.h --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_BUILTINCASCONTEXT_H +#define LLVM_CAS_BUILTINCASCONTEXT_H + +#include "llvm/CAS/CASID.h" +#include "llvm/Support/BLAKE3.h" +#include "llvm/Support/Error.h" + +namespace llvm::cas::builtin { + +/// Current hash type for the builtin CAS. +/// +/// FIXME: This should be configurable via an enum to allow configuring the hash +/// function. The enum should be sent into \a createInMemoryCAS() and \a +/// createOnDiskCAS(). +/// +/// This is important (at least) for future-proofing, when we want to make new +/// CAS instances use BLAKE7, but still know how to read/write BLAKE3. +/// +/// Even just for BLAKE3, it would be useful to have these values: +/// +/// BLAKE3 => 32B hash from BLAKE3 +/// BLAKE3_16B => 16B hash from BLAKE3 (truncated) +/// +/// ... where BLAKE3_16 uses \a TruncatedBLAKE3<16>. +/// +/// Motivation for a truncated hash is that it's cheaper to store. It's not +/// clear if we always (or ever) need the full 32B, and for an ephemeral +/// in-memory CAS, we almost certainly don't need it. +/// +/// Note that the cost is linear in the number of objects for the builtin CAS, +/// since we're using internal offsets and/or pointers as an optimization. 
+/// +/// However, it's possible we'll want to hook up a local builtin CAS to, e.g., +/// a distributed generic hash map to use as an ActionCache. In that scenario, +/// the transitive closure of the structured objects that are the results of +/// the cached actions would need to be serialized into the map, something +/// like: +/// +/// "action::" -> "0123" +/// "object::0123" -> "3,4567,89AB,CDEF,9,some data" +/// "object::4567" -> ... +/// "object::89AB" -> ... +/// "object::CDEF" -> ... +/// +/// These references would be full cost. +using HasherT = BLAKE3; +using HashType = decltype(HasherT::hash(std::declval &>())); + +class BuiltinCASContext : public CASContext { + void printIDImpl(raw_ostream &OS, const CASID &ID) const final; + void anchor() override; + +public: + /// Get the name of the hash for any table identifiers. + /// + /// FIXME: This should be configurable via an enum, with at least the + /// following values: + /// + /// "BLAKE3" => 32B hash from BLAKE3 + /// "BLAKE3.16" => 16B hash from BLAKE3 (truncated) + /// + /// Enum can be sent into \a createInMemoryCAS() and \a createOnDiskCAS().
+ static StringRef getHashName() { return "BLAKE3"; } + StringRef getHashSchemaIdentifier() const final { + static const std::string ID = + ("llvm.cas.builtin.v2[" + getHashName() + "]").str(); + return ID; + } + + static const BuiltinCASContext &getDefaultContext(); + + BuiltinCASContext() = default; + + static Expected parseID(StringRef PrintedDigest); + static void printID(ArrayRef Digest, raw_ostream &OS); +}; + +} // namespace llvm::cas::builtin + +#endif // LLVM_CAS_BUILTINCASCONTEXT_H diff --git a/llvm/include/llvm/CAS/BuiltinObjectHasher.h b/llvm/include/llvm/CAS/BuiltinObjectHasher.h new file mode 100644 index 00000000000000..24616af5e10530 --- /dev/null +++ b/llvm/include/llvm/CAS/BuiltinObjectHasher.h @@ -0,0 +1,82 @@ +//===- BuiltinObjectHasher.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_BUILTINOBJECTHASHER_H +#define LLVM_CAS_BUILTINOBJECTHASHER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Support/Endian.h" + +namespace llvm::cas { + +template class BuiltinObjectHasher { +public: + using HashT = decltype(HasherT::hash(std::declval &>())); + + static HashT hashObject(const ObjectStore &CAS, ArrayRef Refs, + ArrayRef Data) { + BuiltinObjectHasher H; + H.updateSize(Refs.size()); + for (const ObjectRef &Ref : Refs) + H.updateRef(CAS, Ref); + H.updateArray(Data); + return H.finish(); + } + + static HashT hashObject(ArrayRef> Refs, + ArrayRef Data) { + BuiltinObjectHasher H; + H.updateSize(Refs.size()); + for (const ArrayRef &Ref : Refs) + H.updateID(Ref); + H.updateArray(Data); + return H.finish(); + } + +private: + HashT finish() { return Hasher.final(); } + + void updateRef(const ObjectStore 
&CAS, ObjectRef Ref) { + updateID(CAS.getID(Ref)); + } + + void updateID(const CASID &ID) { updateID(ID.getHash()); } + + void updateID(ArrayRef Hash) { + // NOTE: Does not hash the size of the hash. That's a CAS implementation + // detail that shouldn't leak into the UUID for an object. + assert(Hash.size() == sizeof(HashT) && + "Expected object ref to match the hash size"); + Hasher.update(Hash); + } + + void updateArray(ArrayRef Bytes) { + updateSize(Bytes.size()); + Hasher.update(Bytes); + } + + void updateArray(ArrayRef Bytes) { + updateArray(ArrayRef(reinterpret_cast(Bytes.data()), + Bytes.size())); + } + + void updateSize(uint64_t Size) { + Size = support::endian::byte_swap(Size, endianness::little); + Hasher.update( + ArrayRef(reinterpret_cast(&Size), sizeof(Size))); + } + + BuiltinObjectHasher() = default; + ~BuiltinObjectHasher() = default; + HasherT Hasher; +}; + +} // namespace llvm::cas + +#endif // LLVM_CAS_BUILTINOBJECTHASHER_H diff --git a/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h new file mode 100644 index 00000000000000..969d097b6cecac --- /dev/null +++ b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h @@ -0,0 +1,26 @@ +//===- BuiltinUnifiedCASDatabases.h -----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H +#define LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H + +#include "llvm/Support/Error.h" + +namespace llvm::cas { + +class ActionCache; +class ObjectStore; + +/// Create on-disk \c ObjectStore and \c ActionCache instances based on +/// \c ondisk::UnifiedOnDiskCache, with built-in hashing. 
+Expected, std::unique_ptr>> +createOnDiskUnifiedCASDatabases(StringRef Path); + +} // namespace llvm::cas + +#endif // LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H diff --git a/llvm/include/llvm/CAS/CASID.h b/llvm/include/llvm/CAS/CASID.h new file mode 100644 index 00000000000000..5f9110a15819ad --- /dev/null +++ b/llvm/include/llvm/CAS/CASID.h @@ -0,0 +1,156 @@ +//===- llvm/CAS/CASID.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_CASID_H +#define LLVM_CAS_CASID_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" + +namespace llvm { + +class raw_ostream; + +namespace cas { + +class CASID; + +/// Context for CAS identifiers. +class CASContext { + virtual void anchor(); + +public: + virtual ~CASContext() = default; + + /// Get an identifier for the schema used by this CAS context. Two CAS + /// instances should return the same identifier if and only if their + /// CASIDs are safe to compare by hash. This is used by the equality + /// operator of \a CASID. + virtual StringRef getHashSchemaIdentifier() const = 0; + +protected: + /// Print \p ID to \p OS. + virtual void printIDImpl(raw_ostream &OS, const CASID &ID) const = 0; + + friend class CASID; +}; + +/// Unique identifier for a CAS object. +/// +/// Locally, stores an internal CAS identifier that's specific to a single CAS +/// instance. It's guaranteed not to change across the view of that CAS, but +/// might change between runs. +/// +/// It also has a \a CASContext pointer to allow comparison of these +/// identifiers.
If two CASIDs are from the same CASContext, they can be +/// compared directly. If they are not, then \a +/// CASContext::getHashSchemaIdentifier() is compared to see if they can be +/// compared by hash, in which case the result of \a getHash() is compared. +class CASID { +public: + void dump() const; + void print(raw_ostream &OS) const { + return getContext().printIDImpl(OS, *this); + } + friend raw_ostream &operator<<(raw_ostream &OS, const CASID &ID) { + ID.print(OS); + return OS; + } + std::string toString() const; + + ArrayRef getHash() const { + return arrayRefFromStringRef(Hash); + } + + friend bool operator==(const CASID &LHS, const CASID &RHS) { + if (LHS.Context == RHS.Context) + return LHS.Hash == RHS.Hash; + + // EmptyKey or TombstoneKey. + if (!LHS.Context || !RHS.Context) + return false; + + // CASIDs are equal when they have the same hash schema and same hash value. + return LHS.Context->getHashSchemaIdentifier() == + RHS.Context->getHashSchemaIdentifier() && + LHS.Hash == RHS.Hash; + } + + friend bool operator!=(const CASID &LHS, const CASID &RHS) { + return !(LHS == RHS); + } + + friend hash_code hash_value(const CASID &ID) { + ArrayRef Hash = ID.getHash(); + return hash_combine_range(Hash.begin(), Hash.end()); + } + + const CASContext &getContext() const { + assert(Context && "Tombstone or empty key for DenseMap?"); + return *Context; + } + + static CASID getDenseMapEmptyKey() { + return CASID(nullptr, DenseMapInfo::getEmptyKey()); + } + static CASID getDenseMapTombstoneKey() { + return CASID(nullptr, DenseMapInfo::getTombstoneKey()); + } + + CASID() = delete; + + static CASID create(const CASContext *Context, StringRef Hash) { + return CASID(Context, Hash); + } + +private: + CASID(const CASContext *Context, StringRef Hash) + : Context(Context), Hash(Hash) {} + + const CASContext *Context; + SmallString<32> Hash; +}; + +/// This is used to workaround the issue of MSVC needing default-constructible +/// types for \c std::promise/future.
+template struct AsyncValue { + Expected> take() { return std::move(Value); } + + AsyncValue() : Value(std::nullopt) {} + AsyncValue(Error &&E) : Value(std::move(E)) {} + AsyncValue(T &&V) : Value(std::move(V)) {} + AsyncValue(std::nullopt_t) : Value(std::nullopt) {} + AsyncValue(Expected> &&Obj) : Value(std::move(Obj)) {} + +private: + Expected> Value; +}; + +} // namespace cas + +template <> struct DenseMapInfo { + static cas::CASID getEmptyKey() { return cas::CASID::getDenseMapEmptyKey(); } + + static cas::CASID getTombstoneKey() { + return cas::CASID::getDenseMapTombstoneKey(); + } + + static unsigned getHashValue(cas::CASID ID) { + return (unsigned)hash_value(ID); + } + + static bool isEqual(cas::CASID LHS, cas::CASID RHS) { return LHS == RHS; } +}; + +} // namespace llvm + +#endif // LLVM_CAS_CASID_H diff --git a/llvm/include/llvm/CAS/CASNodeSchema.h b/llvm/include/llvm/CAS/CASNodeSchema.h new file mode 100644 index 00000000000000..490337857cae0f --- /dev/null +++ b/llvm/include/llvm/CAS/CASNodeSchema.h @@ -0,0 +1,74 @@ +//===- llvm/CAS/CASNodeSchema.h ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_CASNODESCHEMA_H +#define LLVM_CAS_CASNODESCHEMA_H + +#include "llvm/CAS/CASReference.h" +#include "llvm/Support/ExtensibleRTTI.h" + +namespace llvm::cas { + +class ObjectProxy; + +/// A base class for schemas built on top of CAS nodes. +/// +/// TODO: Build a FilesystemSchema on top of this for reimplementing Trees on +/// top of the CAS. +class NodeSchema : public RTTIExtends { + void anchor() override; + +public: + static char ID; + + /// Check if \a Node is a root (entry node) for the schema. 
This is a strong + /// check, since it requires that the first reference matches a complete + /// type-id DAG. + virtual bool isRootNode(const cas::ObjectProxy &Node) const = 0; + + virtual bool isNode(const cas::ObjectProxy &Node) const = 0; + + cas::ObjectStore &CAS; + +protected: + NodeSchema(cas::ObjectStore &CAS) : CAS(CAS) {} + +public: + virtual ~NodeSchema() = default; +}; + +/// Creates all the schemas and can be used to retrieve a particular schema +/// based on a CAS root node. A client should aim to create and maximize re-use +/// of an instance of this object. +class SchemaPool { +public: + /// Look up the schema for the provided root node. Returns \a nullptr if no + /// schema was found or it's not actually a root node. The returned \p + /// NodeSchema pointer is owned by the \p SchemaPool instance, therefore it + /// cannot be used beyond the \p SchemaPool instance's lifetime. + /// + /// Thread-safe. + NodeSchema *getSchemaForRoot(cas::ObjectProxy Node) const; + + /// Add a schema to the pool. + void addSchema(std::unique_ptr S) { + Schemas.push_back(std::move(S)); + } + + cas::ObjectStore &getCAS() const { return CAS; } + + explicit SchemaPool(cas::ObjectStore &CAS) : CAS(CAS) {} + +private: + cas::ObjectStore &CAS; + SmallVector> Schemas; +}; + +} // namespace llvm::cas + +#endif // LLVM_CAS_CASNODESCHEMA_H diff --git a/llvm/include/llvm/CAS/CASReference.h b/llvm/include/llvm/CAS/CASReference.h new file mode 100644 index 00000000000000..1f435cf306c4ca --- /dev/null +++ b/llvm/include/llvm/CAS/CASReference.h @@ -0,0 +1,207 @@ +//===- llvm/CAS/CASReference.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_CASREFERENCE_H +#define LLVM_CAS_CASREFERENCE_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/StringRef.h" + +namespace llvm { + +class raw_ostream; + +namespace cas { + +class ObjectStore; + +class ObjectHandle; +class ObjectRef; + +/// Base class for references to things in \a ObjectStore. +class ReferenceBase { +protected: + struct DenseMapEmptyTag {}; + struct DenseMapTombstoneTag {}; + static constexpr uint64_t getDenseMapEmptyRef() { return -1ULL; } + static constexpr uint64_t getDenseMapTombstoneRef() { return -2ULL; } + +public: + /// Get an internal reference. + uint64_t getInternalRef(const ObjectStore &ExpectedCAS) const { +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + assert(CAS == &ExpectedCAS && "Extracting reference for the wrong CAS"); +#endif + return InternalRef; + } + + unsigned getDenseMapHash() const { + return (unsigned)llvm::hash_value(InternalRef); + } + bool isDenseMapEmpty() const { return InternalRef == getDenseMapEmptyRef(); } + bool isDenseMapTombstone() const { + return InternalRef == getDenseMapTombstoneRef(); + } + bool isDenseMapSentinel() const { + return isDenseMapEmpty() || isDenseMapTombstone(); + } + +protected: + void print(raw_ostream &OS, const ObjectHandle &This) const; + void print(raw_ostream &OS, const ObjectRef &This) const; + + bool hasSameInternalRef(const ReferenceBase &RHS) const { +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + assert( + (isDenseMapSentinel() || RHS.isDenseMapSentinel() || CAS == RHS.CAS) && + "Cannot compare across CAS instances"); +#endif + return InternalRef == RHS.InternalRef; + } + +protected: + friend class ObjectStore; + ReferenceBase(const ObjectStore *CAS, uint64_t InternalRef, bool IsHandle) + : InternalRef(InternalRef) { +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + this->CAS = CAS; +#endif + 
assert(InternalRef != getDenseMapEmptyRef() && "Reserved for DenseMapInfo"); + assert(InternalRef != getDenseMapTombstoneRef() && + "Reserved for DenseMapInfo"); + } + explicit ReferenceBase(DenseMapEmptyTag) + : InternalRef(getDenseMapEmptyRef()) {} + explicit ReferenceBase(DenseMapTombstoneTag) + : InternalRef(getDenseMapTombstoneRef()) {} + +private: + uint64_t InternalRef; + +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + const ObjectStore *CAS = nullptr; +#endif +}; + +/// Reference to an object in a \a ObjectStore instance. +/// +/// If you have an ObjectRef, you know the object exists, and you can point at +/// it from new nodes with \a ObjectStore::store(), but you don't know anything +/// about it. "Loading" the object is a separate step that may not have +/// happened yet, and which can fail (due to filesystem corruption) or +/// introduce latency (if downloading from a remote store). +/// +/// \a ObjectStore::store() takes a list of these, and these are returned by \a +/// ObjectStore::forEachRef() and \a ObjectStore::readRef(), which are accessors +/// for nodes, and \a ObjectStore::getReference(). +/// +/// \a ObjectStore::load() will load the referenced object, and returns \a +/// ObjectHandle, a variant that knows what kind of entity it is. \a +/// ObjectStore::getReferenceKind() can expect the type of reference without +/// asking for unloaded objects to be loaded. +/// +/// This is a wrapper around a \c uint64_t (and a \a ObjectStore instance when +/// assertions are on). If necessary, it can be deconstructed and reconstructed +/// using \a Reference::getInternalRef() and \a +/// Reference::getFromInternalRef(), but clients aren't expected to need to do +/// this. These both require the right \a ObjectStore instance. 
+class ObjectRef : public ReferenceBase { + struct DenseMapTag {}; + +public: + friend bool operator==(const ObjectRef &LHS, const ObjectRef &RHS) { + return LHS.hasSameInternalRef(RHS); + } + friend bool operator!=(const ObjectRef &LHS, const ObjectRef &RHS) { + return !(LHS == RHS); + } + + /// Allow a reference to be recreated after it's deconstructed. + static ObjectRef getFromInternalRef(const ObjectStore &CAS, + uint64_t InternalRef) { + return ObjectRef(CAS, InternalRef); + } + + static ObjectRef getDenseMapEmptyKey() { + return ObjectRef(DenseMapEmptyTag{}); + } + static ObjectRef getDenseMapTombstoneKey() { + return ObjectRef(DenseMapTombstoneTag{}); + } + + /// Print internal ref and/or CASID. Only suitable for debugging. + void print(raw_ostream &OS) const { return ReferenceBase::print(OS, *this); } + + LLVM_DUMP_METHOD void dump() const; + +private: + friend class ObjectStore; + friend class ReferenceBase; + using ReferenceBase::ReferenceBase; + ObjectRef(const ObjectStore &CAS, uint64_t InternalRef) + : ReferenceBase(&CAS, InternalRef, /*IsHandle=*/false) { + assert(InternalRef != -1ULL && "Reserved for DenseMapInfo"); + assert(InternalRef != -2ULL && "Reserved for DenseMapInfo"); + } + explicit ObjectRef(DenseMapEmptyTag T) : ReferenceBase(T) {} + explicit ObjectRef(DenseMapTombstoneTag T) : ReferenceBase(T) {} + explicit ObjectRef(ReferenceBase) = delete; +}; + +/// Handle to a loaded object in a \a ObjectStore instance. +/// +/// ObjectHandle encapsulates a *loaded* object in the CAS. You need one +/// of these to inspect the content of an object: to look at its stored +/// data and references. +class ObjectHandle : public ReferenceBase { +public: + friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS) { + return LHS.hasSameInternalRef(RHS); + } + friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS) { + return !(LHS == RHS); + } + + /// Print internal ref and/or CASID. Only suitable for debugging.
+ void print(raw_ostream &OS) const { return ReferenceBase::print(OS, *this); } + + LLVM_DUMP_METHOD void dump() const; + +private: + friend class ObjectStore; + friend class ReferenceBase; + using ReferenceBase::ReferenceBase; + explicit ObjectHandle(ReferenceBase) = delete; + ObjectHandle(const ObjectStore &CAS, uint64_t InternalRef) + : ReferenceBase(&CAS, InternalRef, /*IsHandle=*/true) {} +}; + +} // namespace cas + +template <> struct DenseMapInfo { + static cas::ObjectRef getEmptyKey() { + return cas::ObjectRef::getDenseMapEmptyKey(); + } + + static cas::ObjectRef getTombstoneKey() { + return cas::ObjectRef::getDenseMapTombstoneKey(); + } + + static unsigned getHashValue(cas::ObjectRef Ref) { + return Ref.getDenseMapHash(); + } + + static bool isEqual(cas::ObjectRef LHS, cas::ObjectRef RHS) { + return LHS == RHS; + } +}; + +} // namespace llvm + +#endif // LLVM_CAS_CASREFERENCE_H diff --git a/llvm/include/llvm/CAS/CASRegistry.h b/llvm/include/llvm/CAS/CASRegistry.h new file mode 100644 index 00000000000000..4654e91d39b49a --- /dev/null +++ b/llvm/include/llvm/CAS/CASRegistry.h @@ -0,0 +1,47 @@ +//===- llvm/CAS/CASRegistry.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/CASID.h" +#include "llvm/CAS/CASReference.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Support/Error.h" + +#ifndef LLVM_CAS_CASREGISTRY_H +#define LLVM_CAS_CASREGISTRY_H + +namespace llvm::cas { + +/// Create ObjectStore from a string identifier. 
+/// Currently the string identifier is using URL scheme with following supported +/// schemes: +/// * InMemory CAS: mem:// +/// * OnDisk CAS: file://${PATH_TO_ONDISK_CAS} +/// * PlugIn CAS: plugin://${PATH_TO_PLUGIN}?${OPT1}=${VAL1}&${OPT2}=${VAL2}.. +/// If no URL scheme is used, it defaults to following (but might change in +/// future) +/// For the plugin scheme, use argument "ondisk-path=${PATH}" to choose the +/// on-disk directory that the plugin should use, otherwise the default +/// OnDiskCAS location will be used. +/// FIXME: Need to implement proper URL encoding scheme that allows "%". +Expected, std::shared_ptr>> +createCASFromIdentifier(StringRef Id); + +/// Check if a string is a CAS identifier. +bool isRegisteredCASIdentifier(StringRef Config); + +/// Register a URL scheme to CAS Identifier. +using ObjectStoreCreateFuncTy = Expected< + std::pair, std::shared_ptr>>( + const Twine &); +void registerCASURLScheme(StringRef Prefix, ObjectStoreCreateFuncTy *Func); + +} // namespace llvm::cas + +#endif diff --git a/llvm/include/llvm/CAS/HierarchicalTreeBuilder.h b/llvm/include/llvm/CAS/HierarchicalTreeBuilder.h new file mode 100644 index 00000000000000..11ab3ec8629eb2 --- /dev/null +++ b/llvm/include/llvm/CAS/HierarchicalTreeBuilder.h @@ -0,0 +1,86 @@ +//===- llvm/CAS/HierarchicalTreeBuilder.h -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_HIERARCHICALTREEBUILDER_H +#define LLVM_CAS_HIERARCHICALTREEBUILDER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/CASReference.h" +#include "llvm/CAS/TreeEntry.h" +#include "llvm/CAS/TreeSchema.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include + +namespace llvm::cas { + +class ObjectStore; + +/// Structure to facilitate building full tree hierarchies. +class HierarchicalTreeBuilder { + struct HierarchicalEntry { + public: + StringRef getPath() const { return Path; } + std::optional getRef() const { return Ref; } + TreeEntry::EntryKind getKind() const { return Kind; } + + HierarchicalEntry(std::optional Ref, TreeEntry::EntryKind Kind, + StringRef Path) + : Ref(Ref), Kind(Kind), Path(Path.str()) { + assert(Ref || Kind == TreeEntry::Tree); + } + + private: + std::optional Ref; + TreeEntry::EntryKind Kind; + std::string Path; + }; + + /// Preallocate space for small trees, common when creating cache keys. + SmallVector Entries; + SmallVector TreeContents; + + void pushImpl(std::optional Ref, TreeEntry::EntryKind Kind, + const Twine &Path); + +public: + /// Add a hierarchical entry at \p Path, which is expected to be from the + /// top-level (otherwise, the caller should prepend a working directory). + /// + /// All ".." components will be squashed by eating the parent. Paths through + /// symlinks will not work, and should be resolved ahead of time. Paths must + /// be POSIX-style. + void push(ObjectRef Ref, TreeEntry::EntryKind Kind, const Twine &Path) { + return pushImpl(Ref, Kind, Path); + } + + /// Add a directory. Ensures the directory will exist even if there are no + /// files pushed from within it.
+ void pushDirectory(const Twine &Path) { + return pushImpl(std::nullopt, TreeEntry::Tree, Path); + } + + /// Add a directory with specific contents. It is functionally equivalent to: + /// * Calling pushDirectory() for every tree + /// * Calling push() for every non-tree + /// + /// Allows merging the contents of multiple directories. + void pushTreeContent(ObjectRef Ref, const Twine &Path); + + /// Drop all entries. + void clear() { Entries.clear(); } + + /// Recursively create the trees implied by calls to \a push(), return the + /// top-level \a CASID. + Expected create(ObjectStore &CAS); +}; + +} // namespace llvm::cas + +#endif // LLVM_CAS_HIERARCHICALTREEBUILDER_H diff --git a/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h b/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h new file mode 100644 index 00000000000000..ac97158d48ddb5 --- /dev/null +++ b/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h @@ -0,0 +1,126 @@ +//===- MappedFileRegionBumpPtr.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_MAPPEDFILEREGIONBUMPPTR_H +#define LLVM_CAS_MAPPEDFILEREGIONBUMPPTR_H + +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/FileSystem.h" +#include + +namespace llvm::cas { + +/// Allocator for an owned mapped file region that supports thread-safe and +/// process-safe bump pointer allocation. +/// +/// This allocator is designed to create a sparse file when supported by the +/// filesystem's \c ftruncate so that it can be used with a large maximum size. +/// It will also attempt to shrink the underlying file down to its current +/// allocation size when the last concurrent mapping is closed. 
+/// +/// Process-safe. Uses file locks when resizing the file during initialization +/// and destruction. +/// +/// Thread-safe, assuming all threads use the same instance to talk to a given +/// file/mapping. Unsafe to have multiple instances talking to the same file +/// in the same process since file locks will misbehave. Clients should +/// coordinate (somehow). +/// +/// \note Currently we allocate the whole file without sparseness on Windows. +/// +/// Provides 8-byte alignment for all allocations. +class MappedFileRegionBumpPtr { +public: + using RegionT = sys::fs::mapped_file_region; + + /// Create a \c MappedFileRegionBumpPtr. + /// + /// \param Path the path to open the mapped region. + /// \param Capacity the maximum size for the mapped file region. + /// \param BumpPtrOffset the offset at which to store the bump pointer. + /// \param NewFileConstructor is for constructing new files. It has exclusive + /// access to the file. Must call \c initializeBumpPtr. + static Expected + create(const Twine &Path, uint64_t Capacity, int64_t BumpPtrOffset, + function_ref NewFileConstructor); + + /// Create a \c MappedFileRegionBumpPtr., shared across the process via a + /// singleton map. + /// + /// FIXME: Singleton map should be based on sys::fs::UniqueID, but currently + /// it is just based on \p Path. + /// + /// \param Path the path to open the mapped region. + /// \param Capacity the maximum size for the mapped file region. + /// \param BumpPtrOffset the offset at which to store the bump pointer. + /// \param NewFileConstructor is for constructing new files. It has exclusive + /// access to the file. Must call \c initializeBumpPtr. + static Expected> createShared( + const Twine &Path, uint64_t Capacity, int64_t BumpPtrOffset, + function_ref NewFileConstructor); + + /// Finish initializing the bump pointer. Must be called by + /// \c NewFileConstructor. 
+ void initializeBumpPtr(int64_t BumpPtrOffset); + + /// Minimum alignment for allocations, currently hardcoded to 8B. + static constexpr Align getAlign() { + // Trick Align into giving us '8' as a constexpr. + struct alignas(8) T {}; + static_assert(alignof(T) == 8, "Tautology failed?"); + return Align::Of(); + } + + /// Allocate at least \p AllocSize. Rounds up to \a getAlign(). + char *allocate(uint64_t AllocSize) { + return data() + allocateOffset(AllocSize); + } + /// Allocate, returning the offset from \a data() instead of a pointer. + int64_t allocateOffset(uint64_t AllocSize); + + char *data() const { return Region.data(); } + uint64_t size() const { return *BumpPtr; } + uint64_t capacity() const { return Region.size(); } + + RegionT &getRegion() { return Region; } + + ~MappedFileRegionBumpPtr() { destroyImpl(); } + + MappedFileRegionBumpPtr() = default; + MappedFileRegionBumpPtr(MappedFileRegionBumpPtr &&RHS) { moveImpl(RHS); } + MappedFileRegionBumpPtr &operator=(MappedFileRegionBumpPtr &&RHS) { + destroyImpl(); + moveImpl(RHS); + return *this; + } + + MappedFileRegionBumpPtr(const MappedFileRegionBumpPtr &) = delete; + MappedFileRegionBumpPtr &operator=(const MappedFileRegionBumpPtr &) = delete; + +private: + void destroyImpl(); + void moveImpl(MappedFileRegionBumpPtr &RHS) { + std::swap(Region, RHS.Region); + std::swap(BumpPtr, RHS.BumpPtr); + std::swap(Path, RHS.Path); + std::swap(FD, RHS.FD); + std::swap(SharedLockFD, RHS.SharedLockFD); + } + +private: + RegionT Region; + std::atomic *BumpPtr = nullptr; + std::string Path; + std::optional FD; + std::optional SharedLockFD; +}; + +} // namespace llvm::cas + +#endif // LLVM_CAS_MAPPEDFILEREGIONBUMPPTR_H diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h new file mode 100644 index 00000000000000..931f5046a5bc1c --- /dev/null +++ b/llvm/include/llvm/CAS/ObjectStore.h @@ -0,0 +1,339 @@ +//===- llvm/CAS/ObjectStore.h -----------------------------------*- C++ -*-===// +// 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_OBJECTSTORE_H +#define LLVM_CAS_OBJECTSTORE_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/CASID.h" +#include "llvm/CAS/CASReference.h" +#include "llvm/CAS/TreeEntry.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include +#include + +namespace llvm { + +class MemoryBuffer; +template class unique_function; + +namespace cas { + +class ObjectStore; +class ObjectProxy; + +using AsyncProxyValue = AsyncValue; + +/// Content-addressable storage for objects. +/// +/// Conceptually, objects are stored in a "unique set". +/// +/// - Objects are immutable ("value objects") that are defined by their +/// content. They are implicitly deduplicated by content. +/// - Each object has a unique identifier (UID) that's derived from its content, +/// called a \a CASID. +/// - This UID is a fixed-size (strong) hash of the transitive content of a +/// CAS object. +/// - It's comparable between any two CAS instances that have the same \a +/// CASIDContext::getHashSchemaIdentifier(). +/// - The UID can be printed (e.g., \a CASID::toString()) and it can parsed +/// by the same or a different CAS instance with \a +/// ObjectStore::parseID(). +/// - An object can be looked up by content or by UID. +/// - \a store() is "get-or-create" methods, writing an object if it +/// doesn't exist yet, and return a ref to it in any case. +/// - \a loadObject(const CASID&) looks up an object by its UID. +/// - Objects can reference other objects, forming an arbitrary DAG. +/// +/// The \a ObjectStore interface has a few ways of referencing objects: +/// +/// - \a ObjectRef encapsulates a reference to something in the CAS. 
It is an +/// opaque type that references an object inside a specific CAS. It is +/// implementation defined if the underlying object exists or not for an +/// ObjectRef, and it can used to speed up CAS lookup as an implementation +/// detail. However, you don't know anything about the underlying objects. +/// "Loading" the object is a separate step that may not have happened +/// yet, and which can fail (e.g. due to filesystem corruption) or introduce +/// latency (if downloading from a remote store). +/// - \a ObjectHandle encapulates a *loaded* object in the CAS. You need one of +/// these to inspect the content of an object: to look at its stored +/// data and references. This is internal to CAS implementation and not +/// availble from CAS public APIs. +/// - \a CASID: the UID for an object in the CAS, obtained through \a +/// ObjectStore::getID() or \a ObjectStore::parseID(). This is a valid CAS +/// identifier, but may reference an object that is unknown to this CAS +/// instance. +/// - \a ObjectProxy pairs an ObjectHandle (subclass) with a ObjectStore, and +/// wraps access APIs to avoid having to pass extra parameters. It is the +/// object used for accessing underlying data and refs by CAS users. +/// +/// There are a few options for accessing content of objects, with different +/// lifetime tradeoffs: +/// +/// - \a getData() accesses data without exposing lifetime at all. +/// - \a getMemoryBuffer() returns a \a MemoryBuffer whose lifetime +/// is independent of the CAS (it can live longer). +/// - \a getDataString() return StringRef with lifetime is guaranteed to last as +/// long as \a ObjectStore. +/// - \a readRef() and \a forEachRef() iterate through the references in an +/// object. There is no lifetime assumption. +/// +/// Both ObjectRef and ObjectHandle are lightweight, wrapping a `uint64_t`. +/// Doing anything with them requires a ObjectStore. 
As a convenience: +class ObjectStore { + friend class ObjectProxy; + void anchor(); + +public: + /// Get a \p CASID from a \p ID, which should have been generated by \a + /// CASID::print(). This succeeds as long as \a validateID() would pass. The + /// object may be unknown to this CAS instance. + /// + /// TODO: Remove, and update callers to use \a validateID() or \a + /// extractHashFromID(). + virtual Expected parseID(StringRef ID) = 0; + + /// Store object into ObjectStore. + virtual Expected store(ArrayRef Refs, + ArrayRef Data) = 0; + /// Get an ID for \p Ref. + virtual CASID getID(ObjectRef Ref) const = 0; + + /// Get an existing reference to the object called \p ID. + /// + /// Returns \c None if the object is not stored in this CAS. + virtual std::optional getReference(const CASID &ID) const = 0; + + /// \returns true if the object is directly available from the local CAS, for + /// implementations that have this kind of distinction. + virtual Expected isMaterialized(ObjectRef Ref) const = 0; + + /// Validate the underlying object referred by CASID. + virtual Error validate(const CASID &ID) = 0; + +protected: + /// Load the object referenced by \p Ref. + /// + /// Errors if the object cannot be loaded. + /// \returns \c std::nullopt if the object is missing from the CAS. + virtual Expected> loadIfExists(ObjectRef Ref) = 0; + + /// Asynchronous version of \c loadIfExists. + virtual void loadIfExistsAsync( + ObjectRef Ref, + unique_function>)> Callback); + + /// Like \c loadIfExists but returns an error if the object is missing. + Expected load(ObjectRef Ref); + + /// Get the size of some data. + virtual uint64_t getDataSize(ObjectHandle Node) const = 0; + + /// Methods for handling objects. 
+ virtual Error forEachRef(ObjectHandle Node, + function_ref Callback) const = 0; + virtual ObjectRef readRef(ObjectHandle Node, size_t I) const = 0; + virtual size_t getNumRefs(ObjectHandle Node) const = 0; + virtual ArrayRef getData(ObjectHandle Node, + bool RequiresNullTerminator = false) const = 0; + + /// Get ObjectRef from open file. + virtual Expected + storeFromOpenFileImpl(sys::fs::file_t FD, + std::optional Status); + + /// Get a lifetime-extended StringRef pointing at \p Data. + /// + /// Depending on the CAS implementation, this may involve in-memory storage + /// overhead. + StringRef getDataString(ObjectHandle Node) { + return toStringRef(getData(Node)); + } + + /// Get a lifetime-extended MemoryBuffer pointing at \p Data. + /// + /// Depending on the CAS implementation, this may involve in-memory storage + /// overhead. + std::unique_ptr + getMemoryBuffer(ObjectHandle Node, StringRef Name = "", + bool RequiresNullTerminator = true); + + /// Read all the refs from object in a SmallVector. + virtual void readRefs(ObjectHandle Node, + SmallVectorImpl &Refs) const; + + /// Allow ObjectStore implementations to create internal handles. +#define MAKE_CAS_HANDLE_CONSTRUCTOR(HandleKind) \ + HandleKind make##HandleKind(uint64_t InternalRef) const { \ + return HandleKind(*this, InternalRef); \ + } + MAKE_CAS_HANDLE_CONSTRUCTOR(ObjectHandle) + MAKE_CAS_HANDLE_CONSTRUCTOR(ObjectRef) +#undef MAKE_CAS_HANDLE_CONSTRUCTOR + +public: + /// Helper functions to store object and returns a ObjectProxy. + Expected createProxy(ArrayRef Refs, StringRef Data); + + /// Store object from StringRef. + Expected storeFromString(ArrayRef Refs, + StringRef String) { + return store(Refs, arrayRefFromStringRef(String)); + } + + /// Default implementation reads \p FD and calls \a storeNode(). Does not + /// take ownership of \p FD; the caller is responsible for closing it. + /// + /// If \p Status is sent in it is to be treated as a hint. 
Implementations + /// must protect against the file size potentially growing after the status + /// was taken (i.e., they cannot assume that an mmap will be null-terminated + /// where \p Status implies). + /// + /// Returns the \a CASID and the size of the file. + Expected + storeFromOpenFile(sys::fs::file_t FD, + std::optional Status = std::nullopt) { + return storeFromOpenFileImpl(FD, Status); + } + + static Error createUnknownObjectError(const CASID &ID); + + /// Create ObjectProxy from CASID. If the object doesn't exist, get an error. + Expected getProxy(const CASID &ID); + /// Create ObjectProxy from ObjectRef. If the object can't be loaded, get an + /// error. + Expected getProxy(ObjectRef Ref); + + /// \returns \c std::nullopt if the object is missing from the CAS. + Expected> getProxyIfExists(ObjectRef Ref); + + /// Asynchronous version of \c getProxyIfExists. + std::future getProxyFuture(ObjectRef Ref); + + /// Asynchronous version of \c getProxyIfExists using a callback. + void getProxyAsync( + ObjectRef Ref, + unique_function>)> Callback); + + /// Read the data from \p Data into \p OS. + uint64_t readData(ObjectHandle Node, raw_ostream &OS, uint64_t Offset = 0, + uint64_t MaxBytes = -1ULL) const { + ArrayRef Data = getData(Node); + assert(Offset < Data.size() && "Expected valid offset"); + Data = Data.drop_front(Offset).take_front(MaxBytes); + OS << toStringRef(Data); + return Data.size(); + } + + /// Validate the whole node tree. + Error validateTree(ObjectRef Ref); + + /// Print the ObjectStore internals for debugging purpose. + virtual void print(raw_ostream &) const {} + void dump() const; + + /// Get CASContext + const CASContext &getContext() const { return Context; } + + virtual ~ObjectStore() = default; + +protected: + ObjectStore(const CASContext &Context) : Context(Context) {} + +private: + const CASContext &Context; +}; + +/// Reference to an abstract hierarchical node, with data and references. 
+/// Reference is passed by value and is expected to be valid as long as the \a +/// ObjectStore is. +class ObjectProxy { +public: + const ObjectStore &getCAS() const { return *CAS; } + ObjectStore &getCAS() { return *CAS; } + CASID getID() const { return CAS->getID(Ref); } + ObjectRef getRef() const { return Ref; } + size_t getNumReferences() const { return CAS->getNumRefs(H); } + ObjectRef getReference(size_t I) const { return CAS->readRef(H, I); } + + operator CASID() const { return getID(); } + CASID getReferenceID(size_t I) const { + std::optional ID = getCAS().getID(getReference(I)); + assert(ID && "Expected reference to be first-class object"); + return *ID; + } + + /// Visit each reference in order, returning an error from \p Callback to + /// stop early. + Error forEachReference(function_ref Callback) const { + return CAS->forEachRef(H, Callback); + } + + std::unique_ptr + getMemoryBuffer(StringRef Name = "", + bool RequiresNullTerminator = true) const; + + /// Get the content of the node. Valid as long as the CAS is valid. + StringRef getData() const { return CAS->getDataString(H); } + + friend bool operator==(const ObjectProxy &Proxy, ObjectRef Ref) { + return Proxy.getRef() == Ref; + } + friend bool operator==(ObjectRef Ref, const ObjectProxy &Proxy) { + return Proxy.getRef() == Ref; + } + friend bool operator!=(const ObjectProxy &Proxy, ObjectRef Ref) { + return !(Proxy.getRef() == Ref); + } + friend bool operator!=(ObjectRef Ref, const ObjectProxy &Proxy) { + return !(Proxy.getRef() == Ref); + } + +public: + ObjectProxy() = delete; + + static ObjectProxy load(ObjectStore &CAS, ObjectRef Ref, ObjectHandle Node) { + return ObjectProxy(CAS, Ref, Node); + } + +private: + ObjectProxy(ObjectStore &CAS, ObjectRef Ref, ObjectHandle H) + : CAS(&CAS), Ref(Ref), H(H) {} + + ObjectStore *CAS; + ObjectRef Ref; + ObjectHandle H; +}; + +std::unique_ptr createInMemoryCAS(); + +/// \returns true if \c LLVM_ENABLE_ONDISK_CAS configuration was enabled. 
+bool isOnDiskCASEnabled(); + +/// Gets or creates a persistent on-disk path at \p Path. +/// +/// Deprecated: if \p Path resolves to \a getDefaultOnDiskCASStableID(), +/// automatically opens \a getDefaultOnDiskCASPath() instead. +/// +/// FIXME: Remove the special behaviour for getDefaultOnDiskCASStableID(). The +/// client should handle this logic, if/when desired. +Expected> createOnDiskCAS(const Twine &Path); + +/// Set \p Path to a reasonable default on-disk path for a persistent CAS for +/// the current user. +void getDefaultOnDiskCASPath(SmallVectorImpl &Path); + +/// Get a reasonable default on-disk path for a persistent CAS for the current +/// user. +std::string getDefaultOnDiskCASPath(); + +} // namespace cas +} // namespace llvm + +#endif // LLVM_CAS_OBJECTSTORE_H diff --git a/llvm/include/llvm/CAS/OnDiskGraphDB.h b/llvm/include/llvm/CAS/OnDiskGraphDB.h new file mode 100644 index 00000000000000..cfcaeed9fc85a0 --- /dev/null +++ b/llvm/include/llvm/CAS/OnDiskGraphDB.h @@ -0,0 +1,406 @@ +//===- OnDiskGraphDB.h ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_ONDISKGRAPHDB_H +#define LLVM_CAS_ONDISKGRAPHDB_H + +#include "llvm/ADT/PointerUnion.h" +#include "llvm/CAS/OnDiskHashMappedTrie.h" + +namespace llvm::cas::ondisk { + +/// 8B reference. 
+class InternalRef {
+public:
+  /// The raw 64-bit payload; the offset view below is the same value.
+  uint64_t getRawData() const { return Data; }
+  uint64_t getRawOffset() const { return Data; }
+
+  /// The payload reinterpreted as a file offset.
+  FileOffset getFileOffset() const { return FileOffset(getRawOffset()); }
+
+  /// Wrap an already-encoded 64-bit value.
+  static InternalRef getFromRawData(uint64_t Data) { return InternalRef(Data); }
+
+  /// Build a reference whose payload is \p Offset.
+  static InternalRef getFromOffset(FileOffset Offset) {
+    return InternalRef(Offset.get());
+  }
+
+  /// Two references are equal when their payloads are.
+  friend bool operator==(InternalRef LHS, InternalRef RHS) {
+    return LHS.Data == RHS.Data;
+  }
+
+private:
+  InternalRef(FileOffset Offset) : Data((uint64_t)Offset.get()) {}
+  InternalRef(uint64_t Data) : Data(Data) {}
+  uint64_t Data;
+};
+
+/// 4B reference.
+class InternalRef4B {
+public:
+  /// The raw 32-bit payload.
+  uint32_t getRawData() const { return Data; }
+
+  /// The payload reinterpreted as a file offset.
+  FileOffset getFileOffset() const { return FileOffset(Data); }
+
+  /// Narrow \p Ref to four bytes, or yield \c std::nullopt when its offset
+  /// does not fit in 32 bits.
+  static std::optional tryToShrink(InternalRef Ref) {
+    uint64_t Wide = Ref.getRawOffset();
+    if (Wide <= UINT32_MAX)
+      return InternalRef4B(Wide);
+
+    return std::nullopt;
+  }
+
+  /// Widen back to the 8-byte form.
+  operator InternalRef() const {
+    return InternalRef::getFromOffset(getFileOffset());
+  }
+
+private:
+  friend class InternalRef;
+  InternalRef4B(uint32_t Data) : Data(Data) {}
+  uint32_t Data;
+};
+
+/// Array of internal node references.
+class InternalRefArrayRef {
+public:
+  size_t size() const { return Size; }
+  bool empty() const { return !Size; }
+
+  /// Random-access iterator over either an 8-byte or a 4-byte reference
+  /// array; dereferencing always yields an \c InternalRef.
+  class iterator
+      : public iterator_facade_base {
+  public:
+    bool operator==(const iterator &RHS) const { return I == RHS.I; }
+    InternalRef operator*() const {
+      if (auto *Ref = dyn_cast(I))
+        return *Ref;
+      // 4-byte element: widened through InternalRef4B's conversion operator.
+      return InternalRef(*I.get());
+    }
+    /// Orders by element address. Both iterators must point into arrays of
+    /// the same element width (asserted).
+    bool operator<(const iterator &RHS) const {
+      assert(I.is() == RHS.I.is());
+      if (auto *Ref = dyn_cast(I))
+        return Ref < RHS.I.get();
+      // Compare, don't subtract: a pointer difference converted to bool would
+      // be true for any two distinct positions, i.e. '!=' instead of '<'.
+      return I.get() <
+             RHS.I.get();
+    }
+    /// Distance in elements. Both iterators must be of the same element width
+    /// (asserted).
+    ptrdiff_t operator-(const iterator &RHS) const {
+      assert(I.is() == RHS.I.is());
+      if (auto *Ref = dyn_cast(I))
+        return Ref - RHS.I.get();
+      return I.get() -
+             RHS.I.get();
+    }
+    iterator &operator+=(ptrdiff_t N) {
+      if (auto *Ref = dyn_cast(I))
+        I = Ref + N;
+      else
+        I = I.get() + N;
+      return *this;
+    }
+    iterator &operator-=(ptrdiff_t N) {
+      if (auto *Ref = dyn_cast(I))
+        I = Ref - N;
+      else
+        I = I.get() - N;
+      return *this;
+    }
+    InternalRef operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
+
+    iterator() = default;
+
+    /// Round-trip the iterator through an integer via the pointer union's
+    /// opaque value.
+    uint64_t getOpaqueData() const { return uintptr_t(I.getOpaqueValue()); }
+
+    static iterator fromOpaqueData(uint64_t Opaque) {
+      return iterator(
+          PointerUnion::getFromOpaqueValue((void *)
+                                           Opaque));
+    }
+
+  private:
+    friend class InternalRefArrayRef;
+    explicit iterator(
+        PointerUnion I)
+        : I(I) {}
+    PointerUnion I;
+  };
+
+  /// Element-wise equality; arrays of different length are never equal.
+  bool operator==(const InternalRefArrayRef &RHS) const {
+    return size() == RHS.size() && std::equal(begin(), end(), RHS.begin());
+  }
+
+  iterator begin() const { return iterator(Begin); }
+  iterator end() const { return begin() + Size; }
+
+  /// Array accessor.
+  InternalRef operator[](ptrdiff_t N) const { return begin()[N]; }
+
+  bool is4B() const { return Begin.is(); }
+  bool is8B() const { return Begin.is(); }
+
+  /// The referenced array viewed as raw bytes: \c Size elements of whichever
+  /// width \c Begin currently holds.
+  ArrayRef getBuffer() const {
+    if (is4B()) {
+      auto *B = Begin.get();
+      return ArrayRef((const uint8_t *)B, sizeof(InternalRef4B) * Size);
+    } else {
+      auto *B = Begin.get();
+      return ArrayRef((const uint8_t *)B, sizeof(InternalRef) * Size);
+    }
+  }
+
+  /// Empty array.
+  InternalRefArrayRef(std::nullopt_t = std::nullopt) {
+    // This is useful so that all the casts in the \p iterator functions can
+    // operate without needing to check for a null value.
+    static InternalRef PlaceHolder = InternalRef::getFromRawData(0);
+    Begin = &PlaceHolder;
+  }
+
+  InternalRefArrayRef(ArrayRef Refs)
+      : Begin(Refs.begin()), Size(Refs.size()) {}
+
+  InternalRefArrayRef(ArrayRef Refs)
+      : Begin(Refs.begin()), Size(Refs.size()) {}
+
+private:
+  PointerUnion Begin;
+  size_t Size = 0;
+};
+
+struct OnDiskContent;
+
+/// Reference to a node. The node's data may not be stored in the database.
+/// An \p ObjectID instance can only be used with the \p OnDiskGraphDB instance
+/// it came from. \p ObjectIDs from different \p OnDiskGraphDB instances are not
+/// comparable.
+class ObjectID {
+public:
+  uint64_t getOpaqueData() const { return Opaque; }
+
+  static ObjectID fromOpaqueData(uint64_t Opaque) { return ObjectID(Opaque); }
+
+  friend bool operator==(const ObjectID &LHS, const ObjectID &RHS) {
+    return LHS.Opaque == RHS.Opaque;
+  }
+  friend bool operator!=(const ObjectID &LHS, const ObjectID &RHS) {
+    return !(LHS == RHS);
+  }
+
+private:
+  explicit ObjectID(uint64_t Opaque) : Opaque(Opaque) {}
+  uint64_t Opaque;
+};
+
+/// Handle for a loaded node object.
+class ObjectHandle {
+public:
+  uint64_t getOpaqueData() const { return Opaque; }
+
+  static ObjectHandle fromOpaqueData(uint64_t Opaque) {
+    return ObjectHandle(Opaque);
+  }
+
+  friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS) {
+    return LHS.Opaque == RHS.Opaque;
+  }
+  friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS) {
+    return !(LHS == RHS);
+  }
+
+private:
+  explicit ObjectHandle(uint64_t Opaque) : Opaque(Opaque) {}
+  uint64_t Opaque;
+};
+
+/// Iterator over an object's references that presents each underlying
+/// \c InternalRef as an \c ObjectID carrying the same raw value.
+class object_refs_iterator
+    : public iterator_facade_base {
+public:
+  bool operator==(const object_refs_iterator &RHS) const { return I == RHS.I; }
+  ObjectID operator*() const {
+    return ObjectID::fromOpaqueData((*I).getRawData());
+  }
+  bool operator<(const object_refs_iterator &RHS) const { return I < RHS.I; }
+  ptrdiff_t operator-(const object_refs_iterator &RHS) const {
+    return I - RHS.I;
+  }
+  object_refs_iterator &operator+=(ptrdiff_t N) {
+    I += N;
+    return *this;
+  }
+  object_refs_iterator &operator-=(ptrdiff_t N) {
+    I -= N;
+    return *this;
+  }
+  ObjectID operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
+
+  object_refs_iterator() = default;
+  object_refs_iterator(InternalRefArrayRef::iterator I) : I(I) {}
+
+  uint64_t getOpaqueData() const { return I.getOpaqueData(); }
+
+  static object_refs_iterator fromOpaqueData(uint64_t Opaque) {
+    return InternalRefArrayRef::iterator::fromOpaqueData(Opaque);
+  }
+
+private:
+  InternalRefArrayRef::iterator I;
+};
+
+using object_refs_range = llvm::iterator_range;
+
+/// On-disk CAS nodes database, independent of a particular hashing algorithm.
+class OnDiskGraphDB {
+public:
+  /// Associate data & references with a particular object ID. If there is
+  /// already a record for this object the operation is a no-op.
+  ///
+  /// \param ID the object ID to associate the data & references with.
+  /// \param Refs references
+  /// \param Data data buffer.
+  Error store(ObjectID ID, ArrayRef Refs, ArrayRef Data);
+
+  /// \returns \p nullopt if the object associated with \p Ref does not exist.
+  Expected> load(ObjectID Ref);
+
+  /// \returns the hash bytes digest for the object reference.
+  ArrayRef getDigest(ObjectID Ref) const {
+    return getDigest(getInternalRef(Ref));
+  }
+
+  /// Form a reference for the provided hash. The reference can be used as part
+  /// of a CAS object even if it's not associated with an object yet.
+  ObjectID getReference(ArrayRef Hash);
+
+  /// Get an existing reference to the object \p Digest.
+  ///
+  /// Returns \p nullopt if the object is not stored in this CAS.
+  std::optional getExistingReference(ArrayRef Digest);
+
+  /// \returns true if the object associated with \p Ref is stored in the CAS.
+  /// This public overload always asks for the upstream check as well.
+  bool containsObject(ObjectID Ref) const {
+    return containsObject(Ref, /*CheckUpstream=*/true);
+  }
+
+  /// \returns the data part of the provided object handle.
+  ArrayRef getObjectData(ObjectHandle Node) const;
+
+  /// \returns the references of the provided object handle, as \c ObjectIDs.
+  object_refs_range getObjectRefs(ObjectHandle Node) const {
+    InternalRefArrayRef Refs = getInternalRefs(Node);
+    return make_range(Refs.begin(), Refs.end());
+  }
+
+  /// \returns Total size of stored objects.
+  ///
+  /// NOTE: There's a possibility that the returned size is not including a
+  /// large object if the process crashed right at the point of inserting it.
+  size_t getStorageSize() const;
+
+  void print(raw_ostream &OS) const;
+
+  /// How to fault-in nodes if an upstream database is used.
+  enum class FaultInPolicy {
+    /// Copy only the requested node.
+    SingleNode,
+    /// Copy the entire graph of a node.
+    FullTree,
+  };
+
+  /// Open the on-disk store from a directory.
+  ///
+  /// \param Path directory for the on-disk store. The directory will be created
+  /// if it doesn't exist.
+  /// \param HashName Identifier name for the hashing algorithm that is going to
+  /// be used.
+  /// \param HashByteSize Size for the object digest hash bytes.
+  /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes
+  /// if they don't exist in the primary store. The upstream store is only used
+  /// for reading nodes, new nodes are only written to the primary store.
+  /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied
+  /// to primary store. This is recorded at creation time and subsequent opens
+  /// need to pass the same policy otherwise the \p open will fail.
+  static Expected>
+  open(StringRef Path, StringRef HashName, unsigned HashByteSize,
+       std::unique_ptr UpstreamDB = nullptr,
+       FaultInPolicy Policy = FaultInPolicy::FullTree);
+
+  ~OnDiskGraphDB();
+
+private:
+  struct IndexProxy;
+  class TempFile;
+  class MappedTempFile;
+
+  bool containsObject(ObjectID Ref, bool CheckUpstream) const;
+
+  /// When \p load is called for a node that doesn't exist, this function tries
+  /// to load it from the upstream store and copy it to the primary one.
+  Expected> faultInFromUpstream(ObjectID PrimaryID);
+  Error importFullTree(ObjectID PrimaryID, ObjectHandle UpstreamNode);
+  Error importSingleNode(ObjectID PrimaryID, ObjectHandle UpstreamNode);
+
+  IndexProxy indexHash(ArrayRef Hash);
+
+  Error createStandaloneLeaf(IndexProxy &I, ArrayRef Data);
+
+  Expected createTempFile(StringRef FinalPath, uint64_t Size);
+
+  OnDiskContent getContentFromHandle(ObjectHandle H) const;
+
+  /// \c ObjectID and \c InternalRef carry the same raw 64-bit value; these two
+  /// helpers convert between them without changing it.
+  static InternalRef getInternalRef(ObjectID Ref) {
+    return InternalRef::getFromRawData(Ref.getOpaqueData());
+  }
+  static ObjectID getExternalReference(InternalRef Ref) {
+    return ObjectID::fromOpaqueData(Ref.getRawData());
+  }
+
+  static ObjectID getExternalReference(const IndexProxy &I);
+
+  void getStandalonePath(StringRef FileSuffix, const IndexProxy &I,
+                         SmallVectorImpl &Path) const;
+
+  ArrayRef getDigest(InternalRef Ref) const;
+  ArrayRef getDigest(const IndexProxy &I) const;
+
+  IndexProxy getIndexProxyFromRef(InternalRef Ref) const;
+
+  static InternalRef makeInternalRef(FileOffset IndexOffset);
+
+  IndexProxy
+  getIndexProxyFromPointer(OnDiskHashMappedTrie::const_pointer P) const;
+
+  InternalRefArrayRef getInternalRefs(ObjectHandle Node) const;
+
+  void recordStandaloneSizeIncrease(size_t SizeIncrease);
+
+  std::atomic &getStandaloneStorageSize();
+  uint64_t getStandaloneStorageSize() const;
+
+  OnDiskGraphDB(StringRef RootPath, OnDiskHashMappedTrie Index,
+                OnDiskDataAllocator DataPool,
+                std::unique_ptr UpstreamDB,
+                FaultInPolicy Policy);
+
+  /// Mapping from hash to object reference.
+  ///
+  /// Data type is TrieRecord.
+  OnDiskHashMappedTrie Index;
+
+  /// Storage for most objects.
+  ///
+  /// Data type is DataRecordHandle.
+  OnDiskDataAllocator DataPool;
+
+  void *StandaloneData; // a StandaloneDataMap.
+
+  std::string RootPath;
+
+  /// Optional on-disk store to be used for faulting-in nodes.
+  std::unique_ptr UpstreamDB;
+  FaultInPolicy FIPolicy;
+};
+
+} // namespace llvm::cas::ondisk
+
+#endif // LLVM_CAS_ONDISKGRAPHDB_H
diff --git a/llvm/include/llvm/CAS/OnDiskHashMappedTrie.h b/llvm/include/llvm/CAS/OnDiskHashMappedTrie.h
new file mode 100644
index 00000000000000..410e5d955eec2b
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskHashMappedTrie.h
@@ -0,0 +1,391 @@
+//===- OnDiskHashMappedTrie.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_ONDISKHASHMAPPEDTRIE_H +#define LLVM_CAS_ONDISKHASHMAPPEDTRIE_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/FileSystem.h" +#include +#include +#include + +namespace llvm { + +class MemoryBuffer; +class raw_ostream; + +namespace cas { + +class FileOffset { +public: + int64_t get() const { return Offset; } + + explicit operator bool() const { return Offset; } + + FileOffset() = default; + explicit FileOffset(int64_t Offset) : Offset(Offset) { assert(Offset >= 0); } + +private: + int64_t Offset = 0; +}; + +/// On-disk hash-mapped trie. Thread-safe / lock-free. +/// +/// This is an on-disk, (mostly) thread-safe key-value store that is (mostly) +/// lock-free. The keys are fixed length, and are expected to be binary hashes +/// with a normal distribution. +/// +/// - Thread-safety is achieved through the use of atomics within a shared +/// memory mapping. Atomic access does not work on networked filesystems. +/// - Filesystem locks are used, but only sparingly: +/// - during initialization, for creating / opening an existing store; +/// - for the lifetime of the instance, a shared/reader lock is held +/// - during destruction, if there are no concurrent readers, to shrink the +/// files to their minimum size. +/// - Path is used as a directory: +/// - "index" stores the root trie and subtries. +/// - "data" stores (most of) the entries, like a bump-ptr-allocator. +/// - Large entries are stored externally in a file named by the key. +/// - Code is system-dependent (Windows not yet implemented), and binary format +/// itself is not portable. 
These are not artifacts that can/should be moved +/// between different systems; they are only appropriate for local storage. +/// +/// FIXME: Add support for storing top-level metadata or identifiers that can +/// be created / read during initialization. +/// +/// FIXME: Implement for Windows. See comment next to implementation of \a +/// OnDiskHashMappedTrie::MappedFileInfo::open(). +class OnDiskHashMappedTrie { +public: + LLVM_DUMP_METHOD void dump() const; + void + print(raw_ostream &OS, + function_ref)> PrintRecordData = nullptr) const; + +public: + struct ConstValueProxy { + ConstValueProxy() = default; + ConstValueProxy(ArrayRef Hash, ArrayRef Data) + : Hash(Hash), Data(Data) {} + ConstValueProxy(ArrayRef Hash, StringRef Data) + : Hash(Hash), Data(Data.begin(), Data.size()) {} + + ArrayRef Hash; + ArrayRef Data; + }; + + struct ValueProxy { + operator ConstValueProxy() const { return ConstValueProxy(Hash, Data); } + + ValueProxy() = default; + ValueProxy(ArrayRef Hash, MutableArrayRef Data) + : Hash(Hash), Data(Data) {} + + ArrayRef Hash; + MutableArrayRef Data; + }; + +private: + struct HintT { + explicit operator ValueProxy() const { + ValueProxy Value; + Value.Data = MutableArrayRef( + const_cast(reinterpret_cast(P)), I); + Value.Hash = ArrayRef(nullptr, B); + return Value; + } + + explicit HintT(ConstValueProxy Value) + : P(Value.Data.data()), I(Value.Data.size()), B(Value.Hash.size()) { + // Spot-check that this really was a hint. 
+ assert(Value.Data.size() <= UINT16_MAX); + assert(Value.Hash.size() <= UINT16_MAX); + assert(Value.Hash.data() == nullptr); + } + + HintT(const void *P, uint16_t I, uint16_t B) : P(P), I(I), B(B) {} + + const void *P = nullptr; + uint16_t I = 0; + uint16_t B = 0; + }; + +public: + template class PointerImpl { + public: + FileOffset getOffset() const { + return FileOffset(OffsetLow32 | (uint64_t)OffsetHigh16 << 32); + } + + explicit operator bool() const { return IsValue; } + + const ProxyT &operator*() const { + assert(IsValue); + return ValueOrHint; + } + const ProxyT *operator->() const { + assert(IsValue); + return &ValueOrHint; + } + + PointerImpl() = default; + + protected: + PointerImpl(FileOffset Offset, ProxyT Value) + : PointerImpl(Value, Offset, /*IsHint=*/false, /*IsValue=*/true) {} + + explicit PointerImpl(FileOffset Offset, HintT H) + : PointerImpl(ValueProxy(H), Offset, /*IsHint=*/true, + /*IsValue=*/false) {} + + PointerImpl(ProxyT ValueOrHint, FileOffset Offset, bool IsHint, + bool IsValue) + : ValueOrHint(ValueOrHint), OffsetLow32((uint64_t)Offset.get()), + OffsetHigh16((uint64_t)Offset.get() >> 32), IsHint(IsHint), + IsValue(IsValue) { + checkOffset(Offset); + } + + static void checkOffset(FileOffset Offset) { + assert(Offset.get() > 0); + assert((uint64_t)Offset.get() < (1LL << 48)); + } + + std::optional getHint(const OnDiskHashMappedTrie &This) const { + if (!IsHint) + return std::nullopt; + HintT H(ValueOrHint); + assert(H.P == &This && "Expected hint to be for This"); + if (H.P != &This) + return std::nullopt; + return H; + } + + ProxyT ValueOrHint; + uint32_t OffsetLow32 = 0; + uint16_t OffsetHigh16 = 0; + bool IsHint = false; + bool IsValue = false; + }; + + class pointer; + class const_pointer : public PointerImpl { + public: + const_pointer() = default; + + private: + friend class pointer; + friend class OnDiskHashMappedTrie; + using const_pointer::PointerImpl::PointerImpl; + }; + + class pointer : public PointerImpl { + public: + 
operator const_pointer() const { + return const_pointer(ValueOrHint, getOffset(), IsHint, IsValue); + } + + pointer() = default; + + private: + friend class OnDiskHashMappedTrie; + using pointer::PointerImpl::PointerImpl; + }; + + pointer getMutablePointer(const_pointer CP) { + if (std::optional H = CP.getHint(*this)) + return pointer(CP.getOffset(), *H); + if (!CP) + return pointer(); + ValueProxy V{CP->Hash, MutableArrayRef(const_cast(CP->Data.data()), + CP->Data.size())}; + return pointer(CP.getOffset(), V); + } + + const_pointer find(ArrayRef Hash) const; + pointer find(ArrayRef Hash) { + return getMutablePointer( + const_cast(this)->find(Hash)); + } + + const_pointer recoverFromHashPointer(const uint8_t *HashBegin) const; + pointer recoverFromHashPointer(const uint8_t *HashBegin) { + return getMutablePointer( + const_cast(this)->recoverFromHashPointer( + HashBegin)); + } + + const_pointer recoverFromFileOffset(FileOffset Offset) const; + pointer recoverFromFileOffset(FileOffset Offset) { + return getMutablePointer( + const_cast(this)->recoverFromFileOffset( + Offset)); + } + + using LazyInsertOnConstructCB = + function_ref; + using LazyInsertOnLeakCB = + function_ref; + + /// Insert lazily. + /// + /// \p OnConstruct is called when ready to insert a value, after allocating + /// space for the data. It is called at most once. + /// + /// \p OnLeak is called only if \p OnConstruct has been called and a race + /// occurred before insertion, causing the tentative offset and data to be + /// abandoned. This allows clients to clean up other results or update any + /// references. + /// + /// NOTE: Does *not* guarantee that \p OnConstruct is only called on success. + /// The in-memory \a HashMappedTrie uses LazyAtomicPointer to synchronize + /// simultaneous writes, but that seems dangerous to use in a memory-mapped + /// file in case a process crashes in the busy state. 
+ pointer insertLazy(const_pointer Hint, ArrayRef Hash, + LazyInsertOnConstructCB OnConstruct = nullptr, + LazyInsertOnLeakCB OnLeak = nullptr); + pointer insertLazy(ArrayRef Hash, + LazyInsertOnConstructCB OnConstruct = nullptr, + LazyInsertOnLeakCB OnLeak = nullptr) { + return insertLazy(const_pointer(), Hash, OnConstruct, OnLeak); + } + + pointer insert(const_pointer Hint, const ConstValueProxy &Value) { + return insertLazy(Hint, Value.Hash, [&](FileOffset, ValueProxy Allocated) { + assert(Allocated.Hash == Value.Hash); + assert(Allocated.Data.size() == Value.Data.size()); + llvm::copy(Value.Data, Allocated.Data.begin()); + }); + } + pointer insert(const ConstValueProxy &Value) { + return insert(const_pointer(), Value); + } + + size_t size() const; + + /// Gets or creates a file at \p Path with a hash-mapped trie named \p + /// TrieName. The hash size is \p NumHashBits (in bits) and the records store + /// data of size \p DataSize (in bytes). + /// + /// \p MaxFileSize controls the maximum file size to support, limiting the + /// size of the \a mapped_file_region. \p NewFileInitialSize is the starting + /// size if a new file is created. + /// + /// \p NewTableNumRootBits and \p NewTableNumSubtrieBits are hints to + /// configure the trie, if it doesn't already exist. + /// + /// \pre NumHashBits is a multiple of 8 (byte-aligned). + /// + /// TODO: Expose the internal DatabaseFile abstraction and add support for + /// adding more tables to a single file. + /// + /// FIXME: Rename to getOrCreate(). 
+ static Expected + create(const Twine &Path, const Twine &TrieName, size_t NumHashBits, + uint64_t DataSize, uint64_t MaxFileSize, + std::optional NewFileInitialSize, + std::optional NewTableNumRootBits = std::nullopt, + std::optional NewTableNumSubtrieBits = std::nullopt); + + OnDiskHashMappedTrie(OnDiskHashMappedTrie &&RHS); + OnDiskHashMappedTrie &operator=(OnDiskHashMappedTrie &&RHS); + ~OnDiskHashMappedTrie(); + +private: + struct ImplType; + explicit OnDiskHashMappedTrie(std::unique_ptr Impl); + std::unique_ptr Impl; +}; + +/// Sink for data. Stores variable length data with 8-byte alignment. Does not +/// track size of data, which is assumed to be known from context, or embedded. +/// Uses 0-padding but does not guarantee 0-termination. +class OnDiskDataAllocator { +public: + using ValueProxy = MutableArrayRef; + + /// An iterator-like return value for data insertion. Maybe it should be + /// called \c iterator, but it has no increment. + class pointer { + public: + FileOffset getOffset() const { return Offset; } + explicit operator bool() const { return bool(getOffset()); } + const ValueProxy &operator*() const { + assert(Offset && "Null dereference"); + return Value; + } + const ValueProxy *operator->() const { + assert(Offset && "Null dereference"); + return &Value; + } + + pointer() = default; + + private: + friend class OnDiskDataAllocator; + pointer(FileOffset Offset, ValueProxy Value) + : Offset(Offset), Value(Value) {} + FileOffset Offset; + ValueProxy Value; + }; + + // Look up the data stored at the given offset.
+ const char *beginData(FileOffset Offset) const; + char *beginData(FileOffset Offset) { + return const_cast( + const_cast(this)->beginData(Offset)); + } + + pointer allocate(size_t Size); + pointer save(ArrayRef Data) { + pointer P = allocate(Data.size()); + llvm::copy(Data, P->begin()); + return P; + } + pointer save(StringRef Data) { + return save(ArrayRef(Data.begin(), Data.size())); + } + + /// \returns the buffer that was allocated at \p create time, with size + /// \p UserHeaderSize. + MutableArrayRef getUserHeader(); + + size_t size() const; + + static Expected + create(const Twine &Path, const Twine &TableName, uint64_t MaxFileSize, + std::optional NewFileInitialSize, + uint32_t UserHeaderSize = 0, + function_ref UserHeaderInit = nullptr); + + OnDiskDataAllocator(OnDiskDataAllocator &&RHS); + OnDiskDataAllocator &operator=(OnDiskDataAllocator &&RHS); + + // No copy. Just call \a create() again. + OnDiskDataAllocator(const OnDiskDataAllocator &) = delete; + OnDiskDataAllocator &operator=(const OnDiskDataAllocator &) = delete; + + ~OnDiskDataAllocator(); + +private: + struct ImplType; + explicit OnDiskDataAllocator(std::unique_ptr Impl); + std::unique_ptr Impl; +}; + +} // namespace cas +} // namespace llvm + +#endif // LLVM_CAS_ONDISKHASHMAPPEDTRIE_H diff --git a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h new file mode 100644 index 00000000000000..a6b4b12491c4dd --- /dev/null +++ b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h @@ -0,0 +1,63 @@ +//===- OnDiskKeyValueDB.h ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_ONDISKKEYVALUEDB_H +#define LLVM_CAS_ONDISKKEYVALUEDB_H + +#include "llvm/CAS/OnDiskHashMappedTrie.h" + +namespace llvm::cas::ondisk { + +/// An on-disk key-value data store with the following properties: +/// * Keys are fixed length binary hashes with expected normal distribution. +/// * Values are buffers of the same size, specified at creation time. +/// * The value of a key cannot be changed once it is set. +/// * The value buffers returned from a key lookup have 8-byte alignment. +class OnDiskKeyValueDB { +public: + /// Associate a value with a key. + /// + /// \param Key the hash bytes for the key + /// \param Value the value bytes, same size as \p ValueSize parameter of + /// \p open call. + /// + /// \returns the value associated with the \p Key. It may be different than + /// \p Value if another value is already associated with this key. + Expected> put(ArrayRef Key, ArrayRef Value); + + /// \returns the value associated with the \p Key, or \p std::nullopt if the + /// key does not exist. + Expected>> get(ArrayRef Key); + + /// \returns Total size of stored data. + size_t getStorageSize() const { return Cache.size(); } + + /// Open the on-disk store from a directory. + /// + /// \param Path directory for the on-disk store. The directory will be created + /// if it doesn't exist. + /// \param HashName Identifier name for the hashing algorithm that is going to + /// be used. + /// \param KeySize Size for the key hash bytes. + /// \param ValueName Identifier name for the values. + /// \param ValueSize Size for the value bytes. 
+ static Expected> + open(StringRef Path, StringRef HashName, unsigned KeySize, + StringRef ValueName, size_t ValueSize); + +private: + OnDiskKeyValueDB(size_t ValueSize, OnDiskHashMappedTrie Cache) + : ValueSize(ValueSize), Cache(std::move(Cache)) {} + + const size_t ValueSize; + OnDiskHashMappedTrie Cache; +}; + +} // namespace llvm::cas::ondisk + +#endif // LLVM_CAS_ONDISKKEYVALUEDB_H diff --git a/llvm/include/llvm/CAS/PluginCAS.h b/llvm/include/llvm/CAS/PluginCAS.h new file mode 100644 index 00000000000000..12c765b7d0b273 --- /dev/null +++ b/llvm/include/llvm/CAS/PluginCAS.h @@ -0,0 +1,28 @@ +//===- llvm/CAS/PluginCAS.h -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Support/Error.h" + +#ifndef LLVM_CAS_PLUGINCAS_H +#define LLVM_CAS_PLUGINCAS_H + +namespace llvm::cas { + +/// Create \c ObjectStore and \c ActionCache instances using the plugin +/// interface. +Expected, std::shared_ptr>> +createPluginCASDatabases( + StringRef PluginPath, StringRef OnDiskPath, + ArrayRef> PluginArgs); + +} // namespace llvm::cas + +#endif diff --git a/llvm/include/llvm/CAS/TreeEntry.h b/llvm/include/llvm/CAS/TreeEntry.h new file mode 100644 index 00000000000000..ab50986cf482d2 --- /dev/null +++ b/llvm/include/llvm/CAS/TreeEntry.h @@ -0,0 +1,71 @@ +//===- llvm/CAS/TreeEntry.h -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_TREEENTRY_H +#define LLVM_CAS_TREEENTRY_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/CASReference.h" + +namespace llvm::cas { + +class ObjectStore; + +class TreeEntry { +public: + enum EntryKind { + Regular, /// A file. + Executable, /// A file that's executable. + Symlink, /// A symbolic link. + Tree, /// A filesystem tree. + }; + + EntryKind getKind() const { return Kind; } + bool isRegular() const { return Kind == Regular; } + bool isExecutable() const { return Kind == Executable; } + bool isFile() const { return isRegular() || isExecutable(); } + bool isSymlink() const { return Kind == Symlink; } + bool isTree() const { return Kind == Tree; } + + ObjectRef getRef() const { return Ref; } + + friend bool operator==(const TreeEntry &LHS, const TreeEntry &RHS) { + return LHS.Kind == RHS.Kind && LHS.Ref == RHS.Ref; + } + + TreeEntry(ObjectRef Ref, EntryKind Kind) : Kind(Kind), Ref(Ref) {} + +private: + EntryKind Kind; + ObjectRef Ref; +}; + +class NamedTreeEntry : public TreeEntry { +public: + StringRef getName() const { return Name; } + + friend bool operator==(const NamedTreeEntry &LHS, const NamedTreeEntry &RHS) { + return static_cast(LHS) == RHS && LHS.Name == RHS.Name; + } + + friend bool operator<(const NamedTreeEntry &LHS, const NamedTreeEntry &RHS) { + return LHS.Name < RHS.Name; + } + + NamedTreeEntry(ObjectRef Ref, EntryKind Kind, StringRef Name) + : TreeEntry(Ref, Kind), Name(Name) {} + + void print(raw_ostream &OS, ObjectStore &CAS) const; + +private: + StringRef Name; +}; + +} // namespace llvm::cas + +#endif // LLVM_CAS_TREEENTRY_H diff --git a/llvm/include/llvm/CAS/TreeSchema.h b/llvm/include/llvm/CAS/TreeSchema.h new file mode 100644 index 00000000000000..ff796bc8c215f9 --- /dev/null +++ b/llvm/include/llvm/CAS/TreeSchema.h @@ -0,0 +1,125 @@ +//===- llvm/CAS/TreeSchema.h 
------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_TREESCHEMA_H +#define LLVM_CAS_TREESCHEMA_H + +#include "llvm/CAS/CASNodeSchema.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/CAS/TreeEntry.h" + +namespace llvm::cas { + +class TreeProxy; + +class TreeSchema : public RTTIExtends { + void anchor() override; + +public: + static char ID; + bool isRootNode(const ObjectProxy &Node) const final { + return false; // TreeSchema doesn't have a root node. + } + bool isNode(const ObjectProxy &Node) const final; + + TreeSchema(ObjectStore &CAS); + + size_t getNumTreeEntries(TreeProxy Tree) const; + + Error + forEachTreeEntry(TreeProxy Tree, + function_ref Callback) const; + + /// Visit each file entry in order, returning an error from \p Callback to + /// stop early. + /// + /// The \p NamedTreeEntry, that the \p Callback receives, points to a name + /// string that may not live beyond the return of the callback function. + /// + /// Passes the \p TreeNodeProxy if the entry is a \p TreeEntry::Tree, + /// otherwise passes \p None. 
+ Error walkFileTreeRecursively( + ObjectStore &CAS, ObjectRef Root, + function_ref)> + Callback); + + std::optional lookupTreeEntry(TreeProxy Tree, StringRef Name) const; + NamedTreeEntry loadTreeEntry(TreeProxy Tree, size_t I) const; + + Expected load(ObjectRef Object) const; + Expected load(ObjectProxy Object) const; + + Expected create(ArrayRef Entries = std::nullopt); + +private: + static constexpr StringLiteral SchemaName = "llvm::cas::schema::tree::v1"; + std::optional TreeKindRef; + + friend class TreeProxy; + + ObjectRef getKindRef() const; +}; + +class TreeProxy : public ObjectProxy { +public: + static Expected get(const TreeSchema &Schema, + Expected Ref); + + static Expected create(TreeSchema &Schema, + ArrayRef Entries); + + const TreeSchema &getSchema() const { return *Schema; } + + bool operator==(const TreeProxy &RHS) const { + return Schema == RHS.Schema && cas::CASID(*this) == cas::CASID(RHS); + } + + Error + forEachEntry(function_ref Callback) const { + return Schema->forEachTreeEntry(*this, Callback); + } + + bool empty() const { return size() == 0; } + size_t size() const { return Schema->getNumTreeEntries(*this); } + + std::optional lookup(StringRef Name) const { + if (auto I = Schema->lookupTreeEntry(*this, Name)) + return get(*I); + return std::nullopt; + } + + StringRef getName(size_t I) const; + + NamedTreeEntry get(size_t I) const { return Schema->loadTreeEntry(*this, I); } + + TreeProxy() = delete; + +private: + TreeProxy(const TreeSchema &Schema, const ObjectProxy &Node) + : ObjectProxy(Node), Schema(&Schema) {} + + class Builder { + public: + static Expected startNode(TreeSchema &Schema); + + Expected build(ArrayRef Entries); + + private: + Builder(const TreeSchema &Schema) : Schema(&Schema) {} + const TreeSchema *Schema; + + public: + SmallString<256> Data; + SmallVector Refs; + }; + const TreeSchema *Schema; +}; + +} // namespace llvm::cas + +#endif // LLVM_CAS_TREESCHEMA_H diff --git a/llvm/include/llvm/CAS/UnifiedOnDiskCache.h 
b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h new file mode 100644 index 00000000000000..9c076cdf5fd6e3 --- /dev/null +++ b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h @@ -0,0 +1,140 @@ +//===- UnifiedOnDiskCache.h -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_UNIFIEDONDISKCACHE_H +#define LLVM_CAS_UNIFIEDONDISKCACHE_H + +#include "llvm/CAS/OnDiskGraphDB.h" + +namespace llvm::cas::ondisk { + +class OnDiskKeyValueDB; + +/// A unified CAS nodes and key-value database, using on-disk storage for both. +/// It manages storage growth and provides APIs for garbage collection. +/// +/// High-level properties: +/// * While \p UnifiedOnDiskCache is open on a directory, by any process, the +/// storage size in that directory will keep growing unrestricted. For data to +/// become eligible for garbage-collection there should be no open instances +/// of \p UnifiedOnDiskCache for that directory, by any process. +/// * Garbage-collection needs to be triggered explicitly by the client. It can +/// be triggered on a directory concurrently, at any time and by any process, +/// without affecting any active readers/writers, in the same process or other +/// processes. +/// +/// Usage patterns should be that an instance of \p UnifiedOnDiskCache is open +/// for a limited period of time, e.g. for the duration of a build operation. +/// For long-living processes that need periodic access to a +/// \p UnifiedOnDiskCache, the client should devise a scheme where access is +/// performed within some defined period.
For example, if a service is designed +/// to continuously wait for requests that access a \p UnifiedOnDiskCache, it +/// could keep the instance alive while new requests are coming in but close it +/// after a time period in which there are no new requests. +class UnifiedOnDiskCache { +public: + /// The \p OnDiskGraphDB instance for the open directory. + OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; } + + /// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key. + /// + /// \param Key the hash bytes for the key. + /// \param Value the \p ObjectID value. + /// + /// \returns the \p ObjectID associated with the \p Key. It may be different + /// than \p Value if another value was already associated with this key. + Expected KVPut(ArrayRef Key, ObjectID Value); + + /// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key. + /// An \p ObjectID as a key is equivalent to its digest bytes. + /// + /// \param Key the \p ObjectID for the key. + /// \param Value the \p ObjectID value. + /// + /// \returns the \p ObjectID associated with the \p Key. It may be different + /// than \p Value if another value was already associated with this key. + Expected KVPut(ObjectID Key, ObjectID Value); + + /// \returns the \p ObjectID, of the \p OnDiskGraphDB instance, associated + /// with the \p Key, or \p std::nullopt if the key does not exist. + Expected> KVGet(ArrayRef Key); + + /// Open a \p UnifiedOnDiskCache instance for a directory. + /// + /// \param Path directory for the on-disk database. The directory will be + /// created if it doesn't exist. + /// \param SizeLimit Optional size for limiting growth. This has an effect for + /// when the instance is closed. + /// \param HashName Identifier name for the hashing algorithm that is going to + /// be used. + /// \param HashByteSize Size for the object digest hash bytes. + /// \param FaultInPolicy Controls how nodes are copied to primary store. 
This + /// is recorded at creation time and subsequent opens need to pass the same + /// policy otherwise the \p open will fail. + static Expected> + open(StringRef Path, std::optional SizeLimit, StringRef HashName, + unsigned HashByteSize, + OnDiskGraphDB::FaultInPolicy FaultInPolicy = + OnDiskGraphDB::FaultInPolicy::FullTree); + + /// This is called implicitly at destruction time, so it is not required for a + /// client to call this. After calling \p close the only method that is valid + /// to call is \p needsGarbaseCollection. + /// + /// \param CheckSizeLimit if true it will check whether the primary store has + /// exceeded its intended size limit. If false the check is skipped even if a + /// \p SizeLimit was passed to the \p open call. + Error close(bool CheckSizeLimit = true); + + /// \returns whether the primary store has exceeded the intended size limit. + /// This can return false even if the overall size of the opened directory is + /// over the \p SizeLimit passed to \p open. To know whether garbage + /// collection needs to be triggered or not, call \p needsGarbaseCollection. + bool hasExceededSizeLimit() const; + + /// \returns whether there are unused data that can be deleted using a + /// \p collectGarbage call. + bool needsGarbaseCollection() const { return NeedsGarbageCollection; } + + /// Remove any unused data from the directory at \p Path. If there are no such + /// data the operation is a no-op. + /// + /// This can be called concurrently, regardless of whether there is an open + /// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers + /// in the same process or other processes. + /// + /// It is recommended that garbage-collection is triggered concurrently in the + /// background, so that it has minimal effect on the workload of the process. 
+ static Error collectGarbage(StringRef Path); + + ~UnifiedOnDiskCache(); + +private: + UnifiedOnDiskCache(); + + Expected> + faultInFromUpstreamKV(ArrayRef Key); + + std::string RootPath; + std::optional SizeLimit; + + int LockFD = -1; + + std::atomic NeedsGarbageCollection; + std::string PrimaryDBDir; + + OnDiskGraphDB *UpstreamGraphDB = nullptr; + std::unique_ptr PrimaryGraphDB; + + std::unique_ptr UpstreamKVDB; + std::unique_ptr PrimaryKVDB; +}; + +} // namespace llvm::cas::ondisk + +#endif // LLVM_CAS_UNIFIEDONDISKCACHE_H diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h index 9cf53360b4e966..38ad0e712b32ed 100644 --- a/llvm/include/llvm/Support/FileSystem.h +++ b/llvm/include/llvm/Support/FileSystem.h @@ -1184,12 +1184,16 @@ openNativeFileForRead(const Twine &Name, OpenFlags Flags = OF_None, /// descriptor. std::error_code tryLockFile(int FD, - std::chrono::milliseconds Timeout = std::chrono::milliseconds(0)); + std::chrono::milliseconds Timeout = std::chrono::milliseconds(0), + bool Exclusive = true); /// Lock the file. /// /// This function acts as @ref tryLockFile but it waits infinitely. -std::error_code lockFile(int FD); +/// \param FD file descriptor to use for locking. +/// \param Exclusive if \p true use exclusive/writer lock, otherwise use +/// shared/reader lock. +std::error_code lockFile(int FD, bool Exclusive = true); /// Unlock the file. 
/// diff --git a/llvm/include/module.modulemap b/llvm/include/module.modulemap index b00da6d7cd28c7..d44d395fa8ef46 100644 --- a/llvm/include/module.modulemap +++ b/llvm/include/module.modulemap @@ -105,6 +105,12 @@ module LLVM_BinaryFormat { textual header "llvm/BinaryFormat/MsgPack.def" } +module LLVM_CAS { + requires cplusplus + umbrella "llvm/CAS" + module * { export * } +} + module LLVM_Config { requires cplusplus umbrella "llvm/Config" diff --git a/llvm/lib/CAS/ActionCache.cpp b/llvm/lib/CAS/ActionCache.cpp new file mode 100644 index 00000000000000..b7d83ebf1722c0 --- /dev/null +++ b/llvm/lib/CAS/ActionCache.cpp @@ -0,0 +1,64 @@ +//===- ActionCache.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/CASID.h" +#include "llvm/CAS/ObjectStore.h" + +using namespace llvm; +using namespace llvm::cas; + +void ActionCache::anchor() {} + +CacheKey::CacheKey(const CASID &ID) : Key(toStringRef(ID.getHash()).str()) {} +CacheKey::CacheKey(const ObjectProxy &Proxy) + : CacheKey(Proxy.getCAS(), Proxy.getRef()) {} +CacheKey::CacheKey(const ObjectStore &CAS, const ObjectRef &Ref) + : Key(toStringRef(CAS.getID(Ref).getHash())) {} + +#ifndef _MSC_VER +/// FIXME: MSVC doesn't compile Error within Promise/Future correctly and will +/// result in unchecked error. Disable AsyncAPIs when using MSVC for now. 
+std::future ActionCache::getFuture(const CacheKey &ActionKey, + bool Globally) const { + std::promise Promise; + auto Future = Promise.get_future(); + getAsync(ActionKey, Globally, + [Promise = + std::move(Promise)](Expected> ID) mutable { + Promise.set_value(std::move(ID)); + }); + return Future; +} + +std::future ActionCache::putFuture(const CacheKey &ActionKey, + const CASID &Result, + bool Globally) { + std::promise Promise; + auto Future = Promise.get_future(); + putAsync(ActionKey, Result, Globally, + [Promise = std::move(Promise)](Error E) mutable { + Promise.set_value(std::move(E)); + }); + return Future; +} +#endif + +void ActionCache::getImplAsync( + ArrayRef ResolvedKey, bool Globally, + unique_function>)> Callback) const { + // The default implementation is synchronous. + return Callback(getImpl(ResolvedKey, Globally)); +} + +void ActionCache::putImplAsync(ArrayRef ResolvedKey, + const CASID &Result, bool Globally, + unique_function Callback) { + // The default implementation is synchronous. + return Callback(putImpl(ResolvedKey, Result, Globally)); +} diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp new file mode 100644 index 00000000000000..ff4f3c637a46e1 --- /dev/null +++ b/llvm/lib/CAS/ActionCaches.cpp @@ -0,0 +1,242 @@ +//===- ActionCaches.cpp -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/ADT/TrieRawHashMap.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskHashMappedTrie.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/BLAKE3.h" +#include "llvm/Support/Path.h" + +#define DEBUG_TYPE "action-caches" + +using namespace llvm; +using namespace llvm::cas; + +namespace { + +using HasherT = BLAKE3; +using HashType = decltype(HasherT::hash(std::declval &>())); + +template class CacheEntry { +public: + CacheEntry() = default; + CacheEntry(ArrayRef Hash) { llvm::copy(Hash, Value.data()); } + CacheEntry(const CacheEntry &Entry) { llvm::copy(Entry.Value, Value.data()); } + ArrayRef getValue() const { return Value; } + +private: + std::array Value; +}; + +class InMemoryActionCache final : public ActionCache { +public: + InMemoryActionCache() + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()) {} + + Error putImpl(ArrayRef ActionKey, const CASID &Result, + bool Globally) final; + Expected> getImpl(ArrayRef ActionKey, + bool Globally) const final; + +private: + using DataT = CacheEntry; + using InMemoryCacheT = ThreadSafeTrieRawHashMap; + + InMemoryCacheT Cache; +}; + +class OnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef ActionKey, const CASID &Result, + bool Globally) final; + Expected> getImpl(ArrayRef ActionKey, + bool Globally) const final; + + static Expected> create(StringRef Path); + +private: + static StringRef getHashName() { return "BLAKE3"; } + + OnDiskActionCache(std::unique_ptr DB); + + std::unique_ptr DB; + using DataT = CacheEntry; +}; + +class UnifiedOnDiskActionCache final : public ActionCache { 
+public: + Error putImpl(ArrayRef ActionKey, const CASID &Result, + bool Globally) final; + Expected> getImpl(ArrayRef ActionKey, + bool Globally) const final; + + UnifiedOnDiskActionCache(std::shared_ptr UniDB); + +private: + std::shared_ptr UniDB; +}; +} // end namespace + +static std::string hashToString(ArrayRef Hash) { + SmallString<64> Str; + toHex(Hash, /*LowerCase=*/true, Str); + return Str.str().str(); +} + +static Error createResultCachePoisonedError(StringRef Key, + const CASContext &Context, + CASID Output, + ArrayRef ExistingOutput) { + std::string Existing = + CASID::create(&Context, toStringRef(ExistingOutput)).toString(); + return createStringError(std::make_error_code(std::errc::invalid_argument), + "cache poisoned for '" + Key + "' (new='" + + Output.toString() + "' vs. existing '" + + Existing + "')"); +} + +Expected> +InMemoryActionCache::getImpl(ArrayRef Key, bool /*Globally*/) const { + auto Result = Cache.find(Key); + if (!Result) + return std::nullopt; + return CASID::create(&getContext(), toStringRef(Result->Data.getValue())); +} + +Error InMemoryActionCache::putImpl(ArrayRef Key, const CASID &Result, + bool /*Globally*/) { + DataT Expected(Result.getHash()); + const InMemoryCacheT::value_type &Cached = *Cache.insertLazy( + Key, [&](auto ValueConstructor) { ValueConstructor.emplace(Expected); }); + + const DataT &Observed = Cached.Data; + if (Expected.getValue() == Observed.getValue()) + return Error::success(); + + return createResultCachePoisonedError(hashToString(Key), getContext(), Result, + Observed.getValue()); +} + +static constexpr StringLiteral DefaultName = "actioncache"; + +namespace llvm { +namespace cas { + +std::string getDefaultOnDiskActionCachePath() { + SmallString<128> Path; + if (!llvm::sys::path::cache_directory(Path)) + report_fatal_error("cannot get default cache directory"); + llvm::sys::path::append(Path, builtin::DefaultDir, DefaultName); + return Path.str().str(); +} + +std::unique_ptr createInMemoryActionCache() { 
+ return std::make_unique(); +} + +} // namespace cas +} // namespace llvm + +OnDiskActionCache::OnDiskActionCache( + std::unique_ptr DB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + DB(std::move(DB)) {} + +Expected> +OnDiskActionCache::create(StringRef AbsPath) { + std::unique_ptr DB; + if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(), + sizeof(HashType), getHashName(), + sizeof(DataT)) + .moveInto(DB)) + return std::move(E); + return std::unique_ptr( + new OnDiskActionCache(std::move(DB))); +} + +Expected> +OnDiskActionCache::getImpl(ArrayRef Key, bool /*Globally*/) const { + std::optional> Val; + if (Error E = DB->get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + return CASID::create(&getContext(), toStringRef(*Val)); +} + +Error OnDiskActionCache::putImpl(ArrayRef Key, const CASID &Result, + bool /*Globally*/) { + auto ResultHash = Result.getHash(); + ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size()); + ArrayRef Observed; + if (Error E = DB->put(Key, Expected).moveInto(Observed)) + return E; + + if (Expected == Observed) + return Error::success(); + + return createResultCachePoisonedError( + hashToString(Key), getContext(), Result, + ArrayRef((const uint8_t *)Observed.data(), Observed.size())); +} + +UnifiedOnDiskActionCache::UnifiedOnDiskActionCache( + std::shared_ptr UniDB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + UniDB(std::move(UniDB)) {} + +Expected> +UnifiedOnDiskActionCache::getImpl(ArrayRef Key, + bool /*Globally*/) const { + std::optional Val; + if (Error E = UniDB->KVGet(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + return CASID::create(&getContext(), + toStringRef(UniDB->getGraphDB().getDigest(*Val))); +} + +Error UnifiedOnDiskActionCache::putImpl(ArrayRef Key, + const CASID &Result, + bool /*Globally*/) { + ondisk::ObjectID Expected = + UniDB->getGraphDB().getReference(Result.getHash()); + 
std::optional Observed; + if (Error E = UniDB->KVPut(Key, Expected).moveInto(Observed)) + return E; + + if (Expected == Observed) + return Error::success(); + + return createResultCachePoisonedError( + hashToString(Key), getContext(), Result, + UniDB->getGraphDB().getDigest(*Observed)); +} + +Expected> +cas::createOnDiskActionCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return OnDiskActionCache::create(Path); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} + +std::unique_ptr +cas::builtin::createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr UniDB) { + return std::make_unique(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp new file mode 100644 index 00000000000000..41e273cee1ba77 --- /dev/null +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -0,0 +1,108 @@ +//===- BuiltinCAS.cpp -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Process.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +static StringRef getCASIDPrefix() { return "llvmcas://"; } +void BuiltinCASContext::anchor() {} + +Expected BuiltinCASContext::parseID(StringRef Reference) { + if (!Reference.consume_front(getCASIDPrefix())) + return createStringError(std::make_error_code(std::errc::invalid_argument), + "invalid cas-id '" + Reference + "'"); + + // FIXME: Allow shortened references? 
+ if (Reference.size() != 2 * sizeof(HashType)) + return createStringError(std::make_error_code(std::errc::invalid_argument), + "wrong size for cas-id hash '" + Reference + "'"); + + std::string Binary; + if (!tryGetFromHex(Reference, Binary)) + return createStringError(std::make_error_code(std::errc::invalid_argument), + "invalid hash in cas-id '" + Reference + "'"); + + assert(Binary.size() == sizeof(HashType)); + HashType Digest; + llvm::copy(Binary, Digest.data()); + return Digest; +} + +Expected BuiltinCAS::parseID(StringRef Reference) { + Expected Digest = BuiltinCASContext::parseID(Reference); + if (!Digest) + return Digest.takeError(); + + return CASID::create(&getContext(), toStringRef(*Digest)); +} + +void BuiltinCASContext::printID(ArrayRef Digest, raw_ostream &OS) { + SmallString<64> Hash; + toHex(Digest, /*LowerCase=*/true, Hash); + OS << getCASIDPrefix() << Hash; +} + +void BuiltinCASContext::printIDImpl(raw_ostream &OS, const CASID &ID) const { + BuiltinCASContext::printID(ID.getHash(), OS); +} + +const BuiltinCASContext &BuiltinCASContext::getDefaultContext() { + static BuiltinCASContext DefaultContext; + return DefaultContext; +} + +Expected BuiltinCAS::store(ArrayRef Refs, + ArrayRef Data) { + return storeImpl(BuiltinObjectHasher::hashObject(*this, Refs, Data), + Refs, Data); +} + +Error BuiltinCAS::validate(const CASID &ID) { + auto Ref = getReference(ID); + if (!Ref) + return createUnknownObjectError(ID); + + auto Handle = load(*Ref); + if (!Handle) + return Handle.takeError(); + + auto Proxy = ObjectProxy::load(*this, *Ref, *Handle); + SmallVector Refs; + if (auto E = Proxy.forEachReference([&](ObjectRef Ref) -> Error { + Refs.push_back(Ref); + return Error::success(); + })) + return E; + + ArrayRef Data(Proxy.getData().data(), Proxy.getData().size()); + auto Hash = BuiltinObjectHasher::hashObject(*this, Refs, Data); + if (!ID.getHash().equals(Hash)) + return createCorruptObjectError(ID); + + return Error::success(); +} + +Expected> 
+cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt, + BuiltinCASContext::getHashName(), + sizeof(HashType)); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h new file mode 100644 index 00000000000000..75f3c92105bbea --- /dev/null +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -0,0 +1,98 @@ +//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CAS_BUILTINCAS_H +#define LLVM_LIB_CAS_BUILTINCAS_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/ObjectStore.h" +#include + +namespace llvm { +namespace cas { +class ActionCache; +namespace ondisk { +class UnifiedOnDiskCache; +} +namespace builtin { + +class BuiltinCAS : public ObjectStore { +public: + BuiltinCAS() : ObjectStore(BuiltinCASContext::getDefaultContext()) {} + + Expected parseID(StringRef Reference) final; + + Expected store(ArrayRef Refs, + ArrayRef Data) final; + virtual Expected storeImpl(ArrayRef ComputedHash, + ArrayRef Refs, + ArrayRef Data) = 0; + + virtual Expected + storeFromNullTerminatedRegion(ArrayRef ComputedHash, + sys::fs::mapped_file_region Map) { + return storeImpl(ComputedHash, std::nullopt, + ArrayRef(Map.data(), Map.size())); + } + + /// Both builtin CAS implementations provide lifetime for free, so this can + /// be const, and readData() and getDataSize() can be implemented on top of + /// it. 
+ virtual ArrayRef getDataConst(ObjectHandle Node) const = 0; + + ArrayRef getData(ObjectHandle Node, + bool RequiresNullTerminator) const final { + // BuiltinCAS Objects are always null terminated. + return getDataConst(Node); + } + uint64_t getDataSize(ObjectHandle Node) const final { + return getDataConst(Node).size(); + } + + Error createUnknownObjectError(const CASID &ID) const { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "unknown object '" + ID.toString() + "'"); + } + + Error createCorruptObjectError(const CASID &ID) const { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "corrupt object '" + ID.toString() + "'"); + } + + Error createCorruptStorageError() const { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "corrupt storage"); + } + + Error validate(const CASID &ID) final; +}; + +/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing. +Expected> +createBuiltinUnifiedOnDiskCache(StringRef Path); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr UniDB); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr UniDB); + +// FIXME: Proxy not portable. Maybe also error-prone? 
+constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default";
+constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default";
+
+} // end namespace builtin
+} // end namespace cas
+} // end namespace llvm
+
+#endif // LLVM_LIB_CAS_BUILTINCAS_H
diff --git a/llvm/lib/CAS/BuiltinObjectHasher.h b/llvm/lib/CAS/BuiltinObjectHasher.h
new file mode 100644
index 00000000000000..e9d7f7d887515f
--- /dev/null
+++ b/llvm/lib/CAS/BuiltinObjectHasher.h
@@ -0,0 +1,89 @@
+//===- BuiltinObjectHasher.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_BUILTINOBJECTHASHER_H
+#define LLVM_CAS_BUILTINOBJECTHASHER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/Endian.h"
+
+namespace llvm {
+namespace cas {
+
+/// Computes the content hash of a CAS object from its references and data.
+///
+/// The digest covers, in order: the number of references, the hash of every
+/// referenced object, the size of the data, and the data bytes themselves.
+/// Instances are only created internally by \a hashObject().
+template class BuiltinObjectHasher {
+public:
+  using HashT = decltype(HasherT::hash(std::declval &>()));
+
+  /// Hash an object made of \p Refs and \p Data. \p CAS is only used to map
+  /// each reference to its \a CASID via \c getID().
+  static HashT hashObject(const ObjectStore &CAS, ArrayRef Refs,
+                          ArrayRef Data) {
+    BuiltinObjectHasher H;
+    H.updateSize(Refs.size());
+    for (const ObjectRef &Ref : Refs)
+      H.updateRef(CAS, Ref);
+    H.updateArray(Data);
+    return H.finish();
+  }
+
+private:
+  /// Finalize the running hash and return the digest.
+  HashT finish() { return Hasher.final(); }
+
+  /// Mix in the hash of the object that \p Ref points to.
+  void updateRef(const ObjectStore &CAS, ObjectRef Ref) {
+    updateID(CAS.getID(Ref));
+  }
+
+  /// Mix in the raw hash bytes of \p ID.
+  void updateID(const CASID &ID) {
+    // NOTE: Does not hash the size of the hash. That's a CAS implementation
+    // detail that shouldn't leak into the UUID for an object.
+    ArrayRef Hash = ID.getHash();
+    assert(Hash.size() == sizeof(HashT) &&
+           "Expected object ref to match the hash size");
+    Hasher.update(Hash);
+  }
+
+  /// Mix in the length of \p Bytes followed by its contents.
+  void updateArray(ArrayRef Bytes) {
+    updateSize(Bytes.size());
+    Hasher.update(Bytes);
+  }
+
+  /// Overload for a different element type: reinterprets the buffer and
+  /// forwards to the overload above.
+  void updateArray(ArrayRef Bytes) {
+    updateArray(ArrayRef(reinterpret_cast(Bytes.data()),
+                         Bytes.size()));
+  }
+
+  /// Mix in \p Size as 8 bytes in little-endian order, so the digest does not
+  /// depend on host endianness.
+  void updateSize(uint64_t Size) {
+    Size = support::endian::byte_swap(Size, support::endianness::little);
+    Hasher.update(
+        ArrayRef(reinterpret_cast(&Size), sizeof(Size)));
+  }
+
+  // Construction is private; use the static hashObject() entry point.
+  BuiltinObjectHasher() = default;
+  ~BuiltinObjectHasher() = default;
+  HasherT Hasher;
+};
+
+} // namespace cas
+} // namespace llvm
+
+#endif // LLVM_CAS_BUILTINOBJECTHASHER_H
diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp
new file mode 100644
index 00000000000000..87073cf2b4f230
--- /dev/null
+++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp
@@ -0,0 +1,32 @@
+//===- BuiltinUnifiedCASDatabases.cpp ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "BuiltinCAS.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+/// Open the unified on-disk cache at \p Path and build an object store and an
+/// action cache on top of the same underlying database.
+///
+/// \returns the object store / action cache pair, or the error reported while
+/// opening the on-disk cache.
+Expected, std::unique_ptr>>
+cas::createOnDiskUnifiedCASDatabases(StringRef Path) {
+  std::shared_ptr UniDB;
+  if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB))
+    return std::move(E);
+  // The object store gets a copy of the shared pointer; the action cache,
+  // created last, takes over this function's own reference.
+  auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB);
+  auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB));
+  return std::make_pair(std::move(CAS), std::move(AC));
+}
diff --git a/llvm/lib/CAS/CASNodeSchema.cpp b/llvm/lib/CAS/CASNodeSchema.cpp
new file mode 100644
index 00000000000000..0ef47f7cc33ef6
--- /dev/null
+++ b/llvm/lib/CAS/CASNodeSchema.cpp
@@ -0,0 +1,23 @@
+//===- CASNodeSchema.cpp --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/CASNodeSchema.h"
+#include "llvm/CAS/ObjectStore.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+char NodeSchema::ID = 0;
+void NodeSchema::anchor() {}
+
+NodeSchema *SchemaPool::getSchemaForRoot(cas::ObjectProxy Node) const {
+  for (auto &Schema : Schemas)
+    if (Schema->isRootNode(Node))
+      return Schema.get();
+  return nullptr; // no schema in the pool accepts Node as its root
+}
diff --git a/llvm/lib/CAS/CASRegistry.cpp b/llvm/lib/CAS/CASRegistry.cpp
new file mode 100644
index 00000000000000..04137b3b242842
--- /dev/null
+++ b/llvm/lib/CAS/CASRegistry.cpp
@@ -0,0 +1,124 @@
+//===- CASRegistry.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/CASRegistry.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/CAS/PluginCAS.h"
+#include "llvm/Support/ManagedStatic.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+/// Factory for the "file://" scheme: create an on-disk CAS and action cache
+/// rooted at \p Path, falling back to the default on-disk CAS location when
+/// \p Path is empty.
+static Expected<
+    std::pair, std::shared_ptr>>
+createOnDiskCASImpl(const Twine &Path) {
+  std::string CASPath = Path.str();
+  // If path is empty, use default ondisk CAS path.
+  if (CASPath.empty())
+    CASPath = getDefaultOnDiskCASPath();
+
+  // Use the resolved path so that the default above actually takes effect.
+  auto UniDB = createOnDiskUnifiedCASDatabases(CASPath);
+  if (!UniDB)
+    return UniDB.takeError();
+
+  return std::pair{std::move(UniDB->first), std::move(UniDB->second)};
+}
+
+/// Factory for the "plugin://" scheme: split \p URL into the plugin path and
+/// its '&'-separated options, then create the plugin-backed databases.
+static Expected<
+    std::pair, std::shared_ptr>>
+createPluginCASImpl(const Twine &URL) {
+  // Format used is
+  // plugin://${PATH_TO_PLUGIN}?${OPT1}=${VAL1}&${OPT2}=${VAL2}..
+  // "ondisk-path" as option is treated specially, the rest of options are
+  // passed to the plugin verbatim.
+  SmallString<256> PathBuf;
+  auto [PluginPath, Options] = URL.toStringRef(PathBuf).split('?');
+  std::string OnDiskPath;
+  SmallVector> PluginArgs;
+  while (!Options.empty()) {
+    StringRef Opt;
+    std::tie(Opt, Options) = Options.split('&');
+    auto [Name, Value] = Opt.split('=');
+    if (Name == "ondisk-path") {
+      OnDiskPath = Value;
+    } else {
+      PluginArgs.push_back({std::string(Name), std::string(Value)});
+    }
+  }
+
+  if (OnDiskPath.empty())
+    OnDiskPath = getDefaultOnDiskCASPath();
+
+  return createPluginCASDatabases(PluginPath, OnDiskPath, PluginArgs);
+}
+
+/// Factory for the "mem://" scheme: the argument is ignored and a fresh
+/// in-memory CAS and action cache are returned.
+static Expected<
+    std::pair, std::shared_ptr>>
+createInMemoryCASImpl(const Twine &) {
+  return std::pair{createInMemoryCAS(), createInMemoryActionCache()};
+}
+
+/// Registered URL scheme prefixes (such as "file://") and their factories.
+static ManagedStatic> RegisteredScheme;
+
+/// Return the scheme table, registering the built-in schemes on first use.
+///
+/// NOTE(review): the isConstructed() check and the inserts are not guarded by
+/// any lock visible here; confirm callers do not race on first use.
+static StringMap &getRegisteredScheme() {
+  if (!RegisteredScheme.isConstructed()) {
+    RegisteredScheme->insert({"mem://", &createInMemoryCASImpl});
+    RegisteredScheme->insert({"file://", &createOnDiskCASImpl});
+    RegisteredScheme->insert({"plugin://", &createPluginCASImpl});
+  }
+  return *RegisteredScheme;
+}
+
+/// Create the databases for \p Id by dispatching on its scheme prefix. The
+/// prefix is stripped before the remainder is handed to the factory.
+///
+/// \returns an invalid_argument error when no registered prefix matches.
+Expected, std::shared_ptr>>
+cas::createCASFromIdentifier(StringRef Id) {
+  for (auto &Scheme : getRegisteredScheme()) {
+    if (Id.consume_front(Scheme.getKey()))
+      return Scheme.getValue()(Id);
+  }
+
+  return createStringError(std::make_error_code(std::errc::invalid_argument),
+                           "Unknown CAS identifier is provided");
+}
+
+/// Return true if \p Id starts with one of the registered scheme prefixes.
+bool cas::isRegisteredCASIdentifier(StringRef Id) {
+  for (auto &Scheme : getRegisteredScheme()) {
+    if (Id.consume_front(Scheme.getKey()))
+      return true;
+  }
+  return false;
+}
+
+/// Register \p Func as the factory for identifiers that start with \p Prefix.
+/// StringMap::insert() keeps an existing entry, so a prefix that is already
+/// registered is left unchanged.
+void cas::registerCASURLScheme(StringRef Prefix,
+                               ObjectStoreCreateFuncTy *Func) {
+  getRegisteredScheme().insert({Prefix, Func});
+}
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
new file mode 100644
index 00000000000000..e2d192b28498b1
--- /dev/null
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -0,0 +1,31 @@
+if (LLVM_ENABLE_ONDISK_CAS)
+  add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1)
+endif()
+
+add_llvm_component_library(LLVMCAS
+  ActionCache.cpp
+  ActionCaches.cpp
+  BuiltinCAS.cpp
+  BuiltinUnifiedCASDatabases.cpp
+  CASNodeSchema.cpp
+  CASRegistry.cpp
+  HierarchicalTreeBuilder.cpp
+  InMemoryCAS.cpp
+  MappedFileRegionBumpPtr.cpp
+  ObjectStore.cpp
+  OnDiskCAS.cpp
+  OnDiskCommon.cpp
+  OnDiskGraphDB.cpp
+  OnDiskHashMappedTrie.cpp
+  OnDiskKeyValueDB.cpp
+  PluginCAS.cpp
+  TreeEntry.cpp
+  TreeSchema.cpp
+  UnifiedOnDiskCache.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS
+
+  LINK_COMPONENTS
+  Support
+)
diff --git a/llvm/lib/CAS/HashMappedTrieIndexGenerator.h b/llvm/lib/CAS/HashMappedTrieIndexGenerator.h
new file mode 100644
index 00000000000000..e021d51a2b3060
--- /dev/null
+++ b/llvm/lib/CAS/HashMappedTrieIndexGenerator.h
@@ -0,0 +1,90 @@
+//===- HashMappedTrieIndexGenerator.h ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CAS_HASHMAPPEDTRIEINDEXGENERATOR_H
+#define LLVM_LIB_CAS_HASHMAPPEDTRIEINDEXGENERATOR_H
+
+#include "llvm/ADT/ArrayRef.h"
+
+namespace llvm {
+namespace cas {
+
+struct IndexGenerator {
+  size_t NumRootBits;    // width in bits of the first index (StartBit == 0)
+  size_t NumSubtrieBits; // width in bits of every index after the first
+  ArrayRef Bytes;        // bytes the indices are extracted from
+  std::optional StartBit = std::nullopt; // unset until next() or hint()
+
+  size_t getNumBits() const { // width of the current index, clamped to Bytes
+    assert(StartBit);
+    size_t TotalNumBits = Bytes.size() * 8;
+    assert(*StartBit <= TotalNumBits);
+    return std::min(*StartBit ? NumSubtrieBits : NumRootBits,
+                    TotalNumBits - *StartBit);
+  }
+  size_t next() { // advance StartBit and return the index found there
+    size_t Index;
+    if (!StartBit) {
+      StartBit = 0;
+      Index = getIndex(Bytes, *StartBit, NumRootBits);
+    } else {
+      *StartBit += *StartBit ? NumSubtrieBits : NumRootBits;
+      assert((*StartBit - NumRootBits) % NumSubtrieBits == 0);
+      Index = getIndex(Bytes, *StartBit, NumSubtrieBits);
+    }
+    return Index;
+  }
+
+  size_t hint(unsigned Index, unsigned Bit) { // jump to Bit; echoes Index
+    assert(Index >= 0); // NOTE(review): always true, Index is unsigned
+    assert(Bit < Bytes.size() * 8);
+    assert(Bit == 0 || (Bit - NumRootBits) % NumSubtrieBits == 0);
+    StartBit = Bit;
+    return Index;
+  }
+
+  size_t getCollidingBits(ArrayRef CollidingBits) const {
+    assert(StartBit); // same bit position as the current index, other bytes
+    return getIndex(CollidingBits, *StartBit, NumSubtrieBits);
+  }
+
+  static size_t getIndex(ArrayRef Bytes, size_t StartBit,
+                         size_t NumBits) { // reads NumBits bits, MSB first
+    assert(StartBit < Bytes.size() * 8);
+
+    Bytes = Bytes.drop_front(StartBit / 8u);
+    StartBit %= 8u;
+    size_t Index = 0;
+    for (uint8_t Byte : Bytes) {
+      size_t ByteStart = 0, ByteEnd = 8;
+      if (StartBit) {
+        ByteStart = StartBit;
+        Byte &= (1u << (8 - StartBit)) - 1u; // drop the top StartBit bits
+        StartBit = 0;
+      }
+      size_t CurrentNumBits = ByteEnd - ByteStart;
+      if (CurrentNumBits > NumBits) {
+        Byte >>= CurrentNumBits - NumBits; // keep the top NumBits of those
+        CurrentNumBits = NumBits;
+      }
+      Index <<= CurrentNumBits;
+      Index |= Byte & ((1u << CurrentNumBits) - 1u);
+
+      assert(NumBits >= CurrentNumBits);
+      NumBits -= CurrentNumBits;
+      if (!NumBits)
+        break;
+    }
+    return Index;
+  }
+};
+
+} // namespace cas
+} // namespace llvm
+
+#endif // LLVM_LIB_CAS_HASHMAPPEDTRIEINDEXGENERATOR_H
diff --git a/llvm/lib/CAS/HierarchicalTreeBuilder.cpp b/llvm/lib/CAS/HierarchicalTreeBuilder.cpp
new file mode 100644
index 00000000000000..0590cee17a1781
--- /dev/null
+++ b/llvm/lib/CAS/HierarchicalTreeBuilder.cpp
@@ -0,0 +1,283 @@
+//===- HierarchicalTreeBuilder.cpp ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/HierarchicalTreeBuilder.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Path.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+/// Critical to canonicalize components so that paths come up next to each
+/// other when sorted.
+///
+/// Makes \p Path absolute, removes "." and ".." components, collapses runs of
+/// '/' into one (dropping a trailing slash), and finally appends a '/' when
+/// the result is empty or \p Kind is a tree. \p Path is rewritten in place and
+/// the returned StringRef points into it.
+static StringRef canonicalize(SmallVectorImpl &Path,
+                              TreeEntry::EntryKind Kind) {
+  // Make absolute.
+  if (Path.empty() || Path.front() != '/')
+    Path.insert(Path.begin(), '/');
+
+  // FIXME: consider rejecting ".." instead of removing them.
+  sys::path::remove_dots(Path, /*remove_dot_dot=*/true,
+                         sys::path::Style::posix);
+
+  // Canonicalize slashes.
+  bool PendingSlash = false;
+  char *NewEnd = Path.begin();
+  for (int I = 0, E = Path.size(); I != E; ++I) {
+    if (Path[I] == '/') {
+      PendingSlash = true;
+      continue;
+    }
+    if (PendingSlash)
+      *NewEnd++ = '/';
+    PendingSlash = false;
+    *NewEnd++ = Path[I];
+  }
+  Path.erase(NewEnd, Path.end());
+
+  // For correct sorting, all explicit trees need to end with a '/'.
+  if (Path.empty() || Kind == TreeEntry::Tree)
+    Path.push_back('/');
+  return StringRef(Path.begin(), Path.size());
+}
+
+/// Record an entry of kind \p Kind at the canonicalized form of \p Path.
+void HierarchicalTreeBuilder::pushImpl(std::optional Ref,
+                                       TreeEntry::EntryKind Kind,
+                                       const Twine &Path) {
+  SmallVector CanonicalPath;
+  Path.toVector(CanonicalPath);
+  Entries.emplace_back(Ref, Kind, canonicalize(CanonicalPath, Kind));
+}
+
+/// Record the tree \p Ref whose contents are to be merged in at \p Path when
+/// \a create() runs.
+void HierarchicalTreeBuilder::pushTreeContent(ObjectRef Ref,
+                                              const Twine &Path) {
+  SmallVector CanonicalPath;
+  Path.toVector(CanonicalPath);
+  TreeEntry::EntryKind Kind = TreeEntry::Tree;
+  TreeContents.emplace_back(Ref, Kind, canonicalize(CanonicalPath, Kind));
+}
+
+/// Build the tree described by the pushed entries and store it in \p CAS.
+///
+/// Pending tree contents are expanded into individual entries first. The
+/// entries are then sorted by path, arranged into an in-memory node
+/// hierarchy, and the trees are created bottom up through \a TreeSchema.
+/// Duplicate paths with a different kind or ID, and entries placed under a
+/// tree that already has a fixed ref, are reported as invalid_argument errors.
+Expected HierarchicalTreeBuilder::create(ObjectStore &CAS) {
+  // FIXME: It is inefficient expanding the whole tree recursively like this,
+  // use a more efficient algorithm to merge contents.
+  TreeSchema Schema(CAS);
+  for (const auto &TreeContent : TreeContents) {
+    StringRef Path = TreeContent.getPath();
+    Error E = Schema.walkFileTreeRecursively(
+        CAS, *TreeContent.getRef(),
+        [&](const NamedTreeEntry &Entry,
+            std::optional Tree) -> Error {
+          if (Entry.getKind() != TreeEntry::Tree) {
+            pushImpl(Entry.getRef(), Entry.getKind(), Path + Entry.getName());
+            return Error::success();
+          }
+          if (Tree->empty())
+            pushDirectory(Path + Entry.getName());
+          return Error::success();
+        });
+    if (E)
+      return std::move(E);
+  }
+  TreeContents.clear();
+
+  if (Entries.empty())
+    return Schema.create();
+
+  std::stable_sort(
+      Entries.begin(), Entries.end(),
+      [](const HierarchicalEntry &LHS, const HierarchicalEntry &RHS) {
+        // Lexicographically smaller paths first.
+        if (int Compare = LHS.getPath().compare(RHS.getPath()))
+          return Compare < 0;
+
+        // Nodes with IDs first (only trees may have a missing Ref).
+        return bool(LHS.getRef()) > bool(RHS.getRef());
+      });
+
+  // Compile into trees.
+  struct Tree;
+  struct Node {
+    Node *Next = nullptr;
+    Tree *Parent = nullptr;
+    std::optional Ref;
+    TreeEntry::EntryKind Kind;
+    StringRef Name;
+
+    bool isTree() const { return Kind == TreeEntry::Tree; }
+  };
+  struct Tree : Node {
+    Node *First = nullptr;
+    bool Visited = false;
+  };
+
+  BumpPtrAllocator Alloc;
+  Tree Root;
+  const HierarchicalEntry *PrevEntry = nullptr;
+  for (const HierarchicalEntry &Entry : Entries) {
+    // Check for duplicates.
+    if (PrevEntry && PrevEntry->getPath() == Entry.getPath()) {
+      // Error if it's not identical.
+      //
+      // FIXME: Maybe we should allow clobbering / merging / etc., but for now
+      // just error.
+      if (Entry.getKind() != PrevEntry->getKind())
+        return createStringError(
+            std::make_error_code(std::errc::invalid_argument),
+            "duplicate path '" + Entry.getPath() + "' with different kind");
+      if (!Entry.getRef()) {
+        assert(Entry.getKind() == TreeEntry::Tree);
+        continue;
+      }
+      assert(PrevEntry->getRef());
+      if (*Entry.getRef() != *PrevEntry->getRef())
+        return createStringError(
+            std::make_error_code(std::errc::invalid_argument),
+            "duplicate path '" + Entry.getPath() + "' with different ID");
+
+      // Skip the duplicate.
+      continue;
+    }
+    PrevEntry = &Entry;
+
+    Tree *Current = &Root;
+    StringRef Path = Entry.getPath();
+    {
+      bool Consumed = Path.consume_front("/");
+      (void)Consumed;
+      assert(Consumed && "Expected canonical POSIX absolute paths");
+    }
+    for (auto Slash = Path.find('/'); !Path.empty(); Slash = Path.find('/')) {
+      StringRef Name;
+      if (Slash == StringRef::npos) {
+        Name = Path;
+        Path = "";
+      } else {
+        Name = Path.take_front(Slash);
+        Path = Path.drop_front(Slash + 1);
+      }
+
+      // If the tree Current already has a ref, then it's fixed and we can't
+      // add anything to it.
+      if (Current->Ref)
+        return createStringError(
+            std::make_error_code(std::errc::invalid_argument),
+            "cannot add '" + Entry.getPath() + "' under fixed tree");
+
+      // Need to canonicalize first, or else the sorting trick doesn't work.
+      assert(Name != "");
+      assert(Name != "/");
+      assert(Name != ".");
+      assert(Name != "..");
+
+      // Check if it's the first node (sorting ahead of time means it's either
+      // the first node, or it doesn't exist yet). Also, check for conflicts
+      // between implied trees and other nodes, such as a blob "/a" and an
+      // implied tree from "/a/b".
+      if (Current->First && Name == Current->First->Name) {
+        if (Path == "" && Entry.getKind() == TreeEntry::Tree) {
+          // Tree already exists. Sort order should ensure a fixed tree comes
+          // first.
+          assert(!Entry.getRef() ||
+                 (Current->Ref && *Current->Ref == *Entry.getRef()));
+          break;
+        }
+        if (Current->First->Kind == TreeEntry::Tree) {
+          // Navigate deeper.
+          Current = static_cast(Current->First);
+          continue;
+        }
+
+        // This is reachable if there are two entries "/duplicate" and
+        // "/duplicate/suffix".
+        return createStringError(
+            std::make_error_code(std::errc::invalid_argument),
+            "duplicate path '" +
+                Entry.getPath().take_front(Name.end() -
+                                           Entry.getPath().begin()) +
+                "'");
+      }
+
+      // Doesn't exist yet.
+      Node *New;
+      Tree *Next = nullptr;
+      if (Path == "" && Entry.getKind() != TreeEntry::Tree) {
+        New = new (Alloc.Allocate()) Node();
+      } else {
+        Next = new (Alloc.Allocate()) Tree();
+        New = Next;
+      }
+      New->Parent = Current;
+      New->Next = Current->First;
+      New->Name = Name;
+      if (Path == "") {
+        New->Kind = Entry.getKind();
+        New->Ref = Entry.getRef();
+      } else {
+        New->Kind = TreeEntry::Tree;
+      }
+      Current->First = New;
+      Current = Next;
+    }
+  }
+
+  // Create the trees bottom up. Pre-allocate space for 8 entries, since many
+  // trees are fairly small when building cache keys.
+  // NOTE: this local shadows the Entries list that was sorted above; from here
+  // on "Entries" names the scratch list handed to Schema.create().
+  SmallVector Entries;
+  SmallVector Worklist = {&Root};
+  while (!Worklist.empty()) {
+    Tree *T = Worklist.back();
+    if (!T->Visited) {
+      assert(!T->Ref && "Trees with fixed content shouldn't be visited");
+      for (Node *N = T->First; N; N = N->Next) {
+        if (!N->Ref) {
+          assert(N->Kind == TreeEntry::Tree);
+          Worklist.push_back(static_cast(N));
+        }
+      }
+      T->Visited = true;
+      continue;
+    }
+
+    Worklist.pop_back();
+    for (Node *N = T->First; N; N = N->Next)
+      Entries.emplace_back(*N->Ref, N->Kind, N->Name);
+    Expected ExpectedTree = Schema.create(Entries);
+    Entries.clear();
+    if (!ExpectedTree)
+      return ExpectedTree.takeError();
+    T->Ref = ExpectedTree->getRef();
+  }
+
+  Expected Obj = cantFail(CAS.getProxy(*Root.Ref));
+#ifndef NDEBUG
+  if (Obj) {
+    if (Error E = CAS.validateTree(Obj->getRef()))
+      return std::move(E);
+  }
+#endif
+  return Obj;
+}
diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp
new file mode 100644
index 00000000000000..378e822d284207
--- /dev/null
+++ b/llvm/lib/CAS/InMemoryCAS.cpp
@@ -0,0 +1,330 @@
+//===- InMemoryCAS.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BuiltinCAS.h"
+#include "llvm/ADT/LazyAtomicPointer.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/TrieRawHashMap.h"
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ThreadSafeAllocator.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::builtin;
+
+namespace {
+
+class InMemoryObject;
+
+/// Index of referenced IDs (map: Hash -> InMemoryObject*). Uses
+/// LazyAtomicPointer to coordinate creation of objects.
+using InMemoryIndexT =
+    ThreadSafeTrieRawHashMap,
+                             sizeof(HashType)>;
+
+/// Values in \a InMemoryIndexT. \a InMemoryObject's point at this to access
+/// their hash.
+using InMemoryIndexValueT = InMemoryIndexT::value_type;
+
+/// Common base of the in-memory object representations. Stores a pointer to
+/// the object's index entry with the \a Kind tag packed into its low bits.
+class InMemoryObject {
+public:
+  enum class Kind {
+    /// Node with refs and data.
+    RefNode,
+
+    /// Node with refs and data co-allocated.
+    InlineNode,
+
+    Max = InlineNode,
+  };
+
+  Kind getKind() const { return IndexAndKind.getInt(); }
+  const InMemoryIndexValueT &getIndex() const {
+    assert(IndexAndKind.getPointer());
+    return *IndexAndKind.getPointer();
+  }
+
+  ArrayRef getHash() const { return getIndex().Hash; }
+
+  InMemoryObject() = delete;
+  InMemoryObject(InMemoryObject &&) = delete;
+  InMemoryObject(const InMemoryObject &) = delete;
+
+protected:
+  InMemoryObject(Kind K, const InMemoryIndexValueT &I) : IndexAndKind(&I, K) {}
+
+private:
+  enum Counts : int {
+    NumKindBits = 2,
+  };
+  PointerIntPair IndexAndKind;
+  static_assert((1U << NumKindBits) <= alignof(InMemoryIndexValueT),
+                "Kind will clobber pointer");
+  static_assert(((int)Kind::Max >> NumKindBits) == 0, "Kind will be truncated");
+
+public:
+  inline ArrayRef getData() const;
+
+  inline ArrayRef getRefs() const;
+};
+
+/// Object whose refs and data are held as ArrayRefs to storage that is not
+/// part of the object's own allocation.
+class InMemoryRefObject : public InMemoryObject {
+public:
+  static constexpr Kind KindValue = Kind::RefNode;
+  static bool classof(const InMemoryObject *O) {
+    return O->getKind() == KindValue;
+  }
+
+  ArrayRef getRefsImpl() const { return Refs; }
+  ArrayRef getRefs() const { return Refs; }
+  ArrayRef getDataImpl() const { return Data; }
+  ArrayRef getData() const { return Data; }
+
+  static InMemoryRefObject &create(function_ref Allocate,
+                                   const InMemoryIndexValueT &I,
+                                   ArrayRef Refs,
+                                   ArrayRef Data) {
+    void *Mem = Allocate(sizeof(InMemoryRefObject));
+    return *new (Mem) InMemoryRefObject(I, Refs, Data);
+  }
+
+private:
+  InMemoryRefObject(const InMemoryIndexValueT &I,
+                    ArrayRef Refs, ArrayRef Data)
+      : InMemoryObject(KindValue, I), Refs(Refs), Data(Data) {
+    assert(isAddrAligned(Align(8), this) && "Expected 8-byte alignment");
+    assert(isAddrAligned(Align(8), Data.data()) && "Expected 8-byte alignment");
+    // Reads one element past the end: the caller must provide the terminator.
+    assert(*Data.end() == 0 && "Expected null-termination");
+  }
+
+  ArrayRef Refs;
+  ArrayRef Data;
+};
+
+/// Object whose refs and data are co-allocated right after the object itself:
+/// the ref array comes first, followed by the data and a null terminator.
+class InMemoryInlineObject : public InMemoryObject {
+public:
+  static constexpr Kind KindValue = Kind::InlineNode;
+  static bool classof(const InMemoryObject *O) {
+    return O->getKind() == KindValue;
+  }
+
+  ArrayRef getRefs() const { return getRefsImpl(); }
+  ArrayRef getRefsImpl() const {
+    return ArrayRef(reinterpret_cast(this + 1),
+                    NumRefs);
+  }
+
+  ArrayRef getData() const { return getDataImpl(); }
+  ArrayRef getDataImpl() const {
+    ArrayRef Refs = getRefs();
+    return ArrayRef(reinterpret_cast(Refs.data() + Refs.size()),
+                    DataSize);
+  }
+
+  static InMemoryInlineObject &
+  create(function_ref Allocate,
+         const InMemoryIndexValueT &I, ArrayRef Refs,
+         ArrayRef Data) {
+    // Room for the header, one pointer-sized slot per ref, the data, and the
+    // trailing null byte.
+    void *Mem = Allocate(sizeof(InMemoryInlineObject) +
+                         sizeof(uintptr_t) * Refs.size() + Data.size() + 1);
+    return *new (Mem) InMemoryInlineObject(I, Refs, Data);
+  }
+
+private:
+  InMemoryInlineObject(const InMemoryIndexValueT &I,
+                       ArrayRef Refs,
+                       ArrayRef Data)
+      : InMemoryObject(KindValue, I), NumRefs(Refs.size()),
+        DataSize(Data.size()) {
+    auto *BeginRefs = reinterpret_cast(this + 1);
+    llvm::copy(Refs, BeginRefs);
+    auto *BeginData = reinterpret_cast(BeginRefs + NumRefs);
+    llvm::copy(Data, BeginData);
+    BeginData[Data.size()] = 0;
+  }
+  uint32_t NumRefs;
+  uint32_t DataSize;
+};
+
+/// In-memory CAS database and action cache (the latter should be separated).
+class InMemoryCAS : public BuiltinCAS { +public: + Expected storeImpl(ArrayRef ComputedHash, + ArrayRef Refs, + ArrayRef Data) final; + + Expected + storeFromNullTerminatedRegion(ArrayRef ComputedHash, + sys::fs::mapped_file_region Map) override; + + CASID getID(const InMemoryIndexValueT &I) const { + StringRef Hash = toStringRef(I.Hash); + return CASID::create(&getContext(), Hash); + } + CASID getID(const InMemoryObject &O) const { return getID(O.getIndex()); } + + ObjectHandle getObjectHandle(const InMemoryObject &Node) const { + assert(!(reinterpret_cast(&Node) & 0x1ULL)); + return makeObjectHandle(reinterpret_cast(&Node)); + } + + Expected> loadIfExists(ObjectRef Ref) override { + return getObjectHandle(asInMemoryObject(Ref)); + } + + InMemoryIndexValueT &indexHash(ArrayRef Hash) { + return *Index.insertLazy( + Hash, [](auto ValueConstructor) { ValueConstructor.emplace(nullptr); }); + } + + /// TODO: Consider callers to actually do an insert and to return a handle to + /// the slot in the trie. 
+ const InMemoryObject *getInMemoryObject(CASID ID) const { + assert(ID.getContext().getHashSchemaIdentifier() == + getContext().getHashSchemaIdentifier() && + "Expected ID from same hash schema"); + if (InMemoryIndexT::const_pointer P = Index.find(ID.getHash())) + return P->Data; + return nullptr; + } + + const InMemoryObject &getInMemoryObject(ObjectHandle OH) const { + return *reinterpret_cast( + (uintptr_t)OH.getInternalRef(*this)); + } + + const InMemoryObject &asInMemoryObject(ReferenceBase Ref) const { + uintptr_t P = Ref.getInternalRef(*this); + return *reinterpret_cast(P); + } + ObjectRef toReference(const InMemoryObject &O) const { + return makeObjectRef(reinterpret_cast(&O)); + } + + CASID getID(ObjectRef Ref) const final { return getIDImpl(Ref); } + CASID getIDImpl(ReferenceBase Ref) const { + return getID(asInMemoryObject(Ref)); + } + + std::optional getReference(const CASID &ID) const final { + if (const InMemoryObject *Object = getInMemoryObject(ID)) + return toReference(*Object); + return std::nullopt; + } + + Expected isMaterialized(ObjectRef Ref) const final { return true; } + + ArrayRef getDataConst(ObjectHandle Node) const final { + return cast(asInMemoryObject(Node)).getData(); + } + + InMemoryCAS() = default; + +private: + size_t getNumRefs(ObjectHandle Node) const final { + return getInMemoryObject(Node).getRefs().size(); + } + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + return toReference(*getInMemoryObject(Node).getRefs()[I]); + } + Error forEachRef(ObjectHandle Node, + function_ref Callback) const final; + + /// Index of referenced IDs (map: Hash -> InMemoryObject*). Mapped to nullptr + /// as a convenient way to store hashes. + /// + /// - Insert nullptr on lookups. + /// - InMemoryObject points back to here. 
+ InMemoryIndexT Index; + + ThreadSafeAllocator Objects; + ThreadSafeAllocator> + MemoryMaps; +}; + +} // end anonymous namespace + +ArrayRef InMemoryObject::getData() const { + if (auto *Derived = dyn_cast(this)) + return Derived->getDataImpl(); + return cast(this)->getDataImpl(); +} + +ArrayRef InMemoryObject::getRefs() const { + if (auto *Derived = dyn_cast(this)) + return Derived->getRefsImpl(); + return cast(this)->getRefsImpl(); +} + +Expected +InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef ComputedHash, + sys::fs::mapped_file_region Map) { + // Look up the hash in the index, initializing to nullptr if it's new. + ArrayRef Data(Map.data(), Map.size()); + auto &I = indexHash(ComputedHash); + + // Load or generate. + auto Allocator = [&](size_t Size) -> void * { + return Objects.Allocate(Size, alignof(InMemoryObject)); + }; + auto Generator = [&]() -> const InMemoryObject * { + return &InMemoryRefObject::create(Allocator, I, std::nullopt, Data); + }; + const InMemoryObject &Node = + cast(I.Data.loadOrGenerate(Generator)); + + // Save Map if the winning node uses it. + if (auto *RefNode = dyn_cast(&Node)) + if (RefNode->getData().data() == Map.data()) + new (MemoryMaps.Allocate(1)) sys::fs::mapped_file_region(std::move(Map)); + + return toReference(Node); +} + +Expected InMemoryCAS::storeImpl(ArrayRef ComputedHash, + ArrayRef Refs, + ArrayRef Data) { + // Look up the hash in the index, initializing to nullptr if it's new. + auto &I = indexHash(ComputedHash); + + // Create the node. 
+ SmallVector InternalRefs; + for (ObjectRef Ref : Refs) + InternalRefs.push_back(&asInMemoryObject(Ref)); + auto Allocator = [&](size_t Size) -> void * { + return Objects.Allocate(Size, alignof(InMemoryObject)); + }; + auto Generator = [&]() -> const InMemoryObject * { + return &InMemoryInlineObject::create(Allocator, I, InternalRefs, Data); + }; + return toReference(cast(I.Data.loadOrGenerate(Generator))); +} + +Error InMemoryCAS::forEachRef(ObjectHandle Handle, + function_ref Callback) const { + auto &Node = getInMemoryObject(Handle); + for (const InMemoryObject *Ref : Node.getRefs()) + if (Error E = Callback(toReference(*Ref))) + return E; + return Error::success(); +} + +std::unique_ptr cas::createInMemoryCAS() { + return std::make_unique(); +} diff --git a/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp b/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp new file mode 100644 index 00000000000000..157871f2dab716 --- /dev/null +++ b/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp @@ -0,0 +1,284 @@ +//===- MappedFileRegionBumpPtr.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// A bump pointer allocator, backed by a memory-mapped file. +/// +/// The effect we want is: +/// +/// 1. If it doesn't exist, create the file with an initial size. +/// 2. Reserve virtual memory large enough for the max file size. +/// 3. Map the file into memory in the reserved region. +/// 4. Increase the file size and update the mapping when necessary. +/// +/// However, updating the mapping is challenging when it needs to work portably, +/// and across multiple processes without locking for every read. Our current +/// implementation strategy is: +/// +/// 1. 
Use \c ftruncate (\c sys::fs::resize_file) to grow the file to its max +/// size (typically several GB). Many modern filesystems will create a sparse +/// file, so that the trailing unused pages do not take space on disk. +/// 2. Call \c mmap (\c sys::fs::mapped_file_region) +/// 3. [Automatic as part of 2.] +/// 4. [Automatic as part of 2.] +/// +/// Additionally, we attempt to resize the file to its actual data size when +/// closing the mapping, if this is the only concurrent instance. This is done +/// using file locks. Shrinking the file mitigates problems with having large +/// files: on filesystems without sparse files it avoids unnecessary space use; +/// it also avoids allocating the full size if another process copies the file, +/// which typically loses sparseness. These mitigations only work while the file +/// is not in use. +/// +/// FIXME: we assume that all concurrent users of the file will use the same +/// value for Capacity. Otherwise a process with a larger capacity can write +/// data that is "out of bounds" for processes with smaller capacity. Currently +/// this is true in the CAS. +/// +/// To support resizing, we use two separate file locks: +/// 1. We use a shared reader lock on a ".shared" file until destruction. +/// 2. We use a lock on the main file during initialization - shared to check +/// the status, upgraded to exclusive to resize/initialize the file. +/// +/// Then during destruction we attempt to get exclusive access on (1), which +/// requires no concurrent readers. If so, we shrink the file. Using two +/// separate locks simplifies the implementation and enables it to work on +/// platforms (e.g. Windows) where a shared/reader lock prevents writing. 
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/MappedFileRegionBumpPtr.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/StringMap.h"
+#include <mutex>
+
+using namespace llvm;
+using namespace llvm::cas;
+
+namespace {
+/// Scoped file lock on an already-open descriptor. A lock that is still held
+/// is released on destruction; an unlock failure at that point is discarded.
+struct FileLockRAII {
+  std::string Path; ///< Only used to label errors.
+  int FD;
+  enum LockKind { Shared, Exclusive };
+  /// Kind of lock currently held, or std::nullopt when not locked.
+  std::optional<LockKind> Locked;
+
+  FileLockRAII(std::string Path, int FD) : Path(std::move(Path)), FD(FD) {}
+  ~FileLockRAII() { consumeError(unlock()); }
+
+  /// Acquire a lock of kind \p LK on FD. Returns a file error naming Path on
+  /// failure, in which case the recorded lock state is left unchanged.
+  Error lock(LockKind LK) {
+    if (std::error_code EC = sys::fs::lockFile(FD, LK == Exclusive))
+      return createFileError(Path, EC);
+    Locked = LK;
+    return Error::success();
+  }
+
+  /// Release the lock if one is recorded as held; otherwise do nothing.
+  Error unlock() {
+    if (Locked) {
+      // Clear the state first so a failed unlock is not retried by the
+      // destructor.
+      Locked = std::nullopt;
+      if (std::error_code EC = sys::fs::unlockFile(FD))
+        return createFileError(Path, EC);
+    }
+    return Error::success();
+  }
+};
+} // end anonymous namespace
+
+/// Open (creating if necessary) the file at \p Path together with its
+/// ".shared" lock file, grow the file to \p Capacity when it is smaller, and
+/// map it read/write. If the file was empty, \p NewFileConstructor is run on
+/// the new region; otherwise the bump pointer stored at \p BumpPtrOffset is
+/// initialized. Returns the region, or the first error encountered.
+Expected<MappedFileRegionBumpPtr> MappedFileRegionBumpPtr::create(
+    const Twine &Path, uint64_t Capacity, int64_t BumpPtrOffset,
+    function_ref<Error(MappedFileRegionBumpPtr &)> NewFileConstructor) {
+  MappedFileRegionBumpPtr Result;
+  Result.Path = Path.str();
+  // Open the main file.
+  int FD;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          Result.Path, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+    return createFileError(Path, EC);
+  Result.FD = FD;
+
+  // Open the shared lock file. See file comment for details of locking scheme.
+  SmallString<128> SharedLockPath(Result.Path);
+  SharedLockPath.append(".shared");
+  int SharedLockFD;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          SharedLockPath, SharedLockFD, sys::fs::CD_OpenAlways,
+          sys::fs::OF_None))
+    return createFileError(SharedLockPath, EC);
+  Result.SharedLockFD = SharedLockFD;
+
+  // Take shared/reader lock that will be held until we close the file; unlocked
+  // by destroyImpl. A failure is reported against the lock file, since that is
+  // the file being locked here.
+  if (std::error_code EC = sys::fs::lockFile(SharedLockFD, /*Exclusive=*/false))
+    return createFileError(SharedLockPath, EC);
+
+  // Take shared/reader lock for initialization.
+  FileLockRAII InitLock(Result.Path, FD);
+  if (Error E = InitLock.lock(FileLockRAII::Shared))
+    return std::move(E);
+
+  sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status(File, Status))
+    return createFileError(Result.Path, EC);
+
+  if (Status.getSize() < Capacity) {
+    // Lock the file exclusively so only one process will do the initialization.
+    if (Error E = InitLock.unlock())
+      return std::move(E);
+    if (Error E = InitLock.lock(FileLockRAII::Exclusive))
+      return std::move(E);
+    // Retrieve the current size now that we have exclusive access.
+    if (std::error_code EC = sys::fs::status(File, Status))
+      return createFileError(Result.Path, EC);
+  }
+
+  // At this point either the file is still under-sized, or we have the size for
+  // the completely initialized file.
+
+  if (Status.getSize() < Capacity) {
+    // We are initializing the file; it may be empty, or may have been shrunk
+    // during a previous close.
+    // FIXME: Detect a case where someone opened it with a smaller capacity.
+    // FIXME: On Windows we should use FSCTL_SET_SPARSE and FSCTL_SET_ZERO_DATA
+    // to make this a sparse region, if supported.
+    if (std::error_code EC = sys::fs::resize_file(FD, Capacity))
+      return createFileError(Result.Path, EC);
+  } else {
+    // Someone else initialized it.
+    Capacity = Status.getSize();
+  }
+
+  // Create the mapped region.
+  {
+    std::error_code EC;
+    sys::fs::mapped_file_region Map(
+        File, sys::fs::mapped_file_region::readwrite, Capacity, 0, EC);
+    if (EC)
+      return createFileError(Result.Path, EC);
+    Result.Region = std::move(Map);
+  }
+
+  // Status still holds the size observed before any resize above, so zero
+  // means the file had no content yet.
+  if (Status.getSize() == 0) {
+    // We are creating a new file; run the constructor.
+    if (Error E = NewFileConstructor(Result))
+      return std::move(E);
+  } else {
+    Result.initializeBumpPtr(BumpPtrOffset);
+  }
+
+  return Result;
+}
+
+/// Like create(), but instances are shared within the process: if a region
+/// previously opened through this function for the same path string is still
+/// alive, it is returned instead of opening and mapping the file again.
+Expected<std::shared_ptr<MappedFileRegionBumpPtr>>
+MappedFileRegionBumpPtr::createShared(
+    const Twine &PathTwine, uint64_t Capacity, int64_t BumpPtrOffset,
+    function_ref<Error(MappedFileRegionBumpPtr &)> NewFileConstructor) {
+  struct MapNode {
+    std::mutex Mutex;
+    std::weak_ptr<MappedFileRegionBumpPtr> MFR;
+  };
+  static std::mutex Mutex;
+
+  // FIXME: Map should be by sys::fs::UniqueID instead of by path. Here's how
+  // it should work:
+  //
+  // 1. Open the file.
+  // 2. Stat the file descriptor to get the UniqueID.
+  // 3. Check the map.
+  // 4. If new, pass the open file descriptor to a helper extracted from
+  //    MappedFileRegionBumpPtr::create().
+  static StringMap<MapNode> Regions;
+
+  SmallString<128> PathStorage;
+  const StringRef Path = PathTwine.toStringRef(PathStorage);
+
+  // The global lock only protects the map lookup/insertion.
+  MapNode *Node;
+  {
+    std::lock_guard<std::mutex> Lock(Mutex);
+    Node = &Regions[Path];
+  }
+
+  // Use a fine-grained per-path lock to allow other regions to be opened
+  // concurrently. It must be held before consulting the cached weak_ptr:
+  // reading it while another thread assigns it below is a data race, and
+  // checking it only before locking lets two threads that both miss each open
+  // and map the same file.
+  std::lock_guard<std::mutex> Lock(Node->Mutex);
+
+  if (std::shared_ptr<MappedFileRegionBumpPtr> MFR = Node->MFR.lock())
+    return MFR;
+
+  // Open / create / initialize files on disk.
+  Expected<MappedFileRegionBumpPtr> ExpectedMFR =
+      MappedFileRegionBumpPtr::create(Path, Capacity, BumpPtrOffset,
+                                      NewFileConstructor);
+  if (!ExpectedMFR)
+    return ExpectedMFR.takeError();
+
+  auto SharedMFR =
+      std::make_shared<MappedFileRegionBumpPtr>(std::move(*ExpectedMFR));
+
+  // Success.
+  Node->MFR = SharedMFR;
+  return std::move(SharedMFR);
+}
+
+/// Release the shared lock, shrink the file to size() if exclusive access to
+/// the lock file can be obtained, and close both descriptors. Does nothing
+/// when no file is open.
+void MappedFileRegionBumpPtr::destroyImpl() {
+  if (!FD)
+    return;
+
+  // Drop the shared lock indicating we are no longer accessing the file.
+  if (SharedLockFD)
+    (void)sys::fs::unlockFile(*SharedLockFD);
+
+  // Attempt to truncate the file if we can get exclusive access. Ignore any
+  // errors.
+  if (BumpPtr) {
+    assert(SharedLockFD && "Must have shared lock file open");
+    if (sys::fs::tryLockFile(*SharedLockFD) == std::error_code()) {
+      assert(size() <= capacity());
+      (void)sys::fs::resize_file(*FD, size());
+      (void)sys::fs::unlockFile(*SharedLockFD);
+    }
+  }
+
+  auto Close = [](std::optional<int> &FD) {
+    if (FD) {
+      sys::fs::file_t File = sys::fs::convertFDToNativeFile(*FD);
+      sys::fs::closeFile(File);
+      FD = std::nullopt;
+    }
+  };
+
+  // Close the file and shared lock.
+  Close(FD);
+  Close(SharedLockFD);
+}
+
+/// Point BumpPtr at the atomic counter stored \p BumpPtrOffset bytes into the
+/// mapping. A counter that is still zero is set to the first offset past the
+/// counter itself; an already-initialized counter is left untouched.
+void MappedFileRegionBumpPtr::initializeBumpPtr(int64_t BumpPtrOffset) {
+  assert(capacity() < (uint64_t)INT64_MAX && "capacity must fit in int64_t");
+  int64_t BumpPtrEndOffset = BumpPtrOffset + sizeof(decltype(*BumpPtr));
+  assert(BumpPtrEndOffset <= (int64_t)capacity() &&
+         "Expected end offset to be pre-allocated");
+  assert(isAligned(Align::Of<decltype(*BumpPtr)>(), BumpPtrOffset) &&
+         "Expected offset to be aligned");
+  BumpPtr = reinterpret_cast<decltype(BumpPtr)>(data() + BumpPtrOffset);
+
+  int64_t ExistingValue = 0;
+  if (!BumpPtr->compare_exchange_strong(ExistingValue, BumpPtrEndOffset))
+    assert(ExistingValue >= BumpPtrEndOffset &&
+           "Expected 0, or past the end of the BumpPtr itself");
+}
+
+/// Reserve \p AllocSize bytes, rounded up to getAlign(), and return the offset
+/// at which the reservation starts. Running past capacity() is a fatal error.
+int64_t MappedFileRegionBumpPtr::allocateOffset(uint64_t AllocSize) {
+  AllocSize = alignTo(AllocSize, getAlign());
+  int64_t OldEnd = BumpPtr->fetch_add(AllocSize);
+  int64_t NewEnd = OldEnd + AllocSize;
+  if (LLVM_UNLIKELY(NewEnd > (int64_t)capacity())) {
+    // Try to return the allocation: if nobody has bumped the pointer since, it
+    // still holds NewEnd, so swap it back to OldEnd. If someone has, the
+    // exchange fails and the pointer is left alone.
+    int64_t BumpedEnd = NewEnd;
+    (void)BumpPtr->compare_exchange_strong(BumpedEnd, OldEnd);
+    report_fatal_error(
+        errorCodeToError(std::make_error_code(std::errc::not_enough_memory)));
+  }
+  return OldEnd;
+}
diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp
new file mode 100644
index 00000000000000..4d58dfb1157a12
--- /dev/null
+++ b/llvm/lib/CAS/ObjectStore.cpp
@@ -0,0 +1,206 @@
+//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/ObjectStore.h"
+#include "BuiltinCAS.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+void CASContext::anchor() {}
+void ObjectStore::anchor() {}
+
+LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); }
+
+std::string CASID::toString() const {
+  std::string S;
+  raw_string_ostream(S) << *this;
+  return S;
+}
+
+static void printReferenceBase(raw_ostream &OS, StringRef Kind,
+                               uint64_t InternalRef, std::optional<CASID> ID) {
+  OS << Kind << "=" << InternalRef;
+  if (ID)
+    OS << "[" << *ID << "]";
+}
+
+void ReferenceBase::print(raw_ostream &OS, const ObjectHandle &This) const {
+  assert(this == &This);
+  printReferenceBase(OS, "object-handle", InternalRef, std::nullopt);
+}
+
+void ReferenceBase::print(raw_ostream &OS, const ObjectRef &This) const {
+  assert(this == &This);
+
+
std::optional ID; +#if LLVM_ENABLE_ABI_BREAKING_CHECKS + if (CAS) + ID = CAS->getID(This); +#endif + printReferenceBase(OS, "object-ref", InternalRef, ID); +} + +void ObjectStore::loadIfExistsAsync( + ObjectRef Ref, + unique_function>)> Callback) { + // The default implementation is synchronous. + Callback(loadIfExists(Ref)); +} + +Expected ObjectStore::load(ObjectRef Ref) { + std::optional Handle; + if (Error E = loadIfExists(Ref).moveInto(Handle)) + return std::move(E); + if (!Handle) + return createStringError(errc::invalid_argument, + "missing object '" + getID(Ref).toString() + "'"); + return *Handle; +} + +std::unique_ptr +ObjectStore::getMemoryBuffer(ObjectHandle Node, StringRef Name, + bool RequiresNullTerminator) { + return MemoryBuffer::getMemBuffer( + toStringRef(getData(Node, RequiresNullTerminator)), Name, + RequiresNullTerminator); +} + +void ObjectStore::readRefs(ObjectHandle Node, + SmallVectorImpl &Refs) const { + consumeError(forEachRef(Node, [&Refs](ObjectRef Ref) -> Error { + Refs.push_back(Ref); + return Error::success(); + })); +} + +Expected ObjectStore::getProxy(const CASID &ID) { + std::optional Ref = getReference(ID); + if (!Ref) + return createUnknownObjectError(ID); + + return getProxy(*Ref); +} + +Expected ObjectStore::getProxy(ObjectRef Ref) { + std::optional H; + if (Error E = load(Ref).moveInto(H)) + return std::move(E); + + return ObjectProxy::load(*this, Ref, *H); +} + +Expected> +ObjectStore::getProxyIfExists(ObjectRef Ref) { + std::optional H; + if (Error E = loadIfExists(Ref).moveInto(H)) + return std::move(E); + if (!H) + return std::nullopt; + return ObjectProxy::load(*this, Ref, *H); +} + +std::future ObjectStore::getProxyFuture(ObjectRef Ref) { + std::promise Promise; + auto Future = Promise.get_future(); + getProxyAsync(Ref, [Promise = std::move(Promise)]( + Expected> Obj) mutable { + Promise.set_value(std::move(Obj)); + }); + return Future; +} + +void ObjectStore::getProxyAsync( + ObjectRef Ref, + unique_function>)> 
Callback) { + // FIXME: there is potential for use-after-free for the 'this' pointer. + // Either we should always allocate shared pointers for \c ObjectStore objects + // and pass \c shared_from_this() or expect that the caller will not release + // the \c ObjectStore before the callback returns. + return loadIfExistsAsync( + Ref, [this, Ref, Callback = std::move(Callback)]( + Expected> H) mutable { + if (!H) + Callback(H.takeError()); + else if (!*H) + Callback(std::nullopt); + else + Callback(ObjectProxy::load(*this, Ref, **H)); + }); +} + +Error ObjectStore::createUnknownObjectError(const CASID &ID) { + return createStringError(std::make_error_code(std::errc::invalid_argument), + "unknown object '" + ID.toString() + "'"); +} + +Expected ObjectStore::createProxy(ArrayRef Refs, + StringRef Data) { + Expected Ref = store(Refs, arrayRefFromStringRef(Data)); + if (!Ref) + return Ref.takeError(); + return getProxy(*Ref); +} + +Expected +ObjectStore::storeFromOpenFileImpl(sys::fs::file_t FD, + std::optional Status) { + // Copy the file into an immutable memory buffer and call \c store on that. + // Using \c mmap would be unsafe because there's a race window between when we + // get the digest hash for the \c mmap contents and when we store the data; if + // the file changes in-between we will create an invalid object. + + // FIXME: For the on-disk CAS implementation use cloning to store it as a + // standalone file if the file-system supports it and the file is large. 
+ + constexpr size_t ChunkSize = 4 * 4096; + SmallString<0> Data; + Data.reserve(ChunkSize * 2); + if (Error E = sys::fs::readNativeFileToEOF(FD, Data, ChunkSize)) + return std::move(E); + return store(std::nullopt, ArrayRef(Data.data(), Data.size())); +} + +Error ObjectStore::validateTree(ObjectRef Root) { + SmallDenseSet ValidatedRefs; + SmallVector RefsToValidate; + RefsToValidate.push_back(Root); + + while (!RefsToValidate.empty()) { + ObjectRef Ref = RefsToValidate.pop_back_val(); + auto [I, Inserted] = ValidatedRefs.insert(Ref); + if (!Inserted) + continue; // already validated. + if (Error E = validate(getID(Ref))) + return E; + Expected Obj = load(Ref); + if (!Obj) + return Obj.takeError(); + if (Error E = forEachRef(*Obj, [&RefsToValidate](ObjectRef R) -> Error { + RefsToValidate.push_back(R); + return Error::success(); + })) + return E; + } + return Error::success(); +} + +std::unique_ptr +ObjectProxy::getMemoryBuffer(StringRef Name, + bool RequiresNullTerminator) const { + return CAS->getMemoryBuffer(H, Name, RequiresNullTerminator); +} diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp new file mode 100644 index 00000000000000..85ea5cae7318ed --- /dev/null +++ b/llvm/lib/CAS/OnDiskCAS.cpp @@ -0,0 +1,205 @@ +//===- OnDiskCAS.cpp --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Support/Path.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class OnDiskCAS : public BuiltinCAS { +public: + Expected storeImpl(ArrayRef ComputedHash, + ArrayRef Refs, + ArrayRef Data) final; + + Expected> loadIfExists(ObjectRef Ref) final; + + CASID getID(ObjectRef Ref) const final; + + std::optional getReference(const CASID &ID) const final; + + Expected isMaterialized(ObjectRef Ref) const final; + + ArrayRef getDataConst(ObjectHandle Node) const final; + + void print(raw_ostream &OS) const final; + + static Expected> open(StringRef Path); + + OnDiskCAS(std::shared_ptr UniDB_) + : UniDB(std::move(UniDB_)), DB(&UniDB->getGraphDB()) {} + +private: + ObjectHandle convertHandle(ondisk::ObjectHandle Node) const { + return makeObjectHandle(Node.getOpaqueData()); + } + + ondisk::ObjectHandle convertHandle(ObjectHandle Node) const { + return ondisk::ObjectHandle::fromOpaqueData(Node.getInternalRef(*this)); + } + + ObjectRef convertRef(ondisk::ObjectID Ref) const { + return makeObjectRef(Ref.getOpaqueData()); + } + + ondisk::ObjectID convertRef(ObjectRef Ref) const { + return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this)); + } + + size_t getNumRefs(ObjectHandle Node) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return std::distance(RefsRange.begin(), RefsRange.end()); + } + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return convertRef(RefsRange.begin()[I]); + } + Error forEachRef(ObjectHandle Node, + function_ref Callback) const final; + + OnDiskCAS(std::unique_ptr DB_) + : OwnedDB(std::move(DB_)), DB(OwnedDB.get()) {} + + 
std::unique_ptr OwnedDB; + std::shared_ptr UniDB; + ondisk::OnDiskGraphDB *DB; +}; + +} // end anonymous namespace + +void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); } + +CASID OnDiskCAS::getID(ObjectRef Ref) const { + ArrayRef Hash = DB->getDigest(convertRef(Ref)); + return CASID::create(&getContext(), toStringRef(Hash)); +} + +std::optional OnDiskCAS::getReference(const CASID &ID) const { + std::optional ObjID = + DB->getExistingReference(ID.getHash()); + if (!ObjID) + return std::nullopt; + return convertRef(*ObjID); +} + +Expected OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const { + return DB->containsObject(convertRef(ExternalRef)); +} + +ArrayRef OnDiskCAS::getDataConst(ObjectHandle Node) const { + return DB->getObjectData(convertHandle(Node)); +} + +Expected> +OnDiskCAS::loadIfExists(ObjectRef ExternalRef) { + Expected> ObjHnd = + DB->load(convertRef(ExternalRef)); + if (!ObjHnd) + return ObjHnd.takeError(); + if (!*ObjHnd) + return std::nullopt; + return convertHandle(**ObjHnd); +} + +Expected OnDiskCAS::storeImpl(ArrayRef ComputedHash, + ArrayRef Refs, + ArrayRef Data) { + SmallVector IDs; + IDs.reserve(Refs.size()); + for (ObjectRef Ref : Refs) { + IDs.push_back(convertRef(Ref)); + } + + ondisk::ObjectID StoredID = DB->getReference(ComputedHash); + if (Error E = DB->store(StoredID, IDs, Data)) + return std::move(E); + return convertRef(StoredID); +} + +Error OnDiskCAS::forEachRef(ObjectHandle Node, + function_ref Callback) const { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + for (ondisk::ObjectID Ref : RefsRange) { + if (Error E = Callback(convertRef(Ref))) + return E; + } + return Error::success(); +} + +Expected> OnDiskCAS::open(StringRef AbsPath) { + Expected> DB = + ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(), + sizeof(HashType)); + if (!DB) + return DB.takeError(); + return std::unique_ptr(new OnDiskCAS(std::move(*DB))); +} + +bool cas::isOnDiskCASEnabled() { +#if LLVM_ENABLE_ONDISK_CAS + 
return true; +#else + return false; +#endif +} + +static constexpr StringLiteral DefaultName = "cas"; + +static void getDefaultOnDiskCASStableID(SmallVectorImpl &Path) { + Path.assign(DefaultDirProxy.begin(), DefaultDirProxy.end()); + llvm::sys::path::append(Path, DefaultDir, DefaultName); +} + +static std::string getDefaultOnDiskCASStableID() { + SmallString<128> Path; + getDefaultOnDiskCASStableID(Path); + return Path.str().str(); +} + +Expected> cas::createOnDiskCAS(const Twine &Path) { +#if LLVM_ENABLE_ONDISK_CAS + // FIXME: An absolute path isn't really good enough. Should open a directory + // and use openat() for files underneath. + SmallString<256> AbsPath; + Path.toVector(AbsPath); + sys::fs::make_absolute(AbsPath); + + // FIXME: Remove this and update clients to do this logic. + if (AbsPath == getDefaultOnDiskCASStableID()) + AbsPath = StringRef(getDefaultOnDiskCASPath()); + + return OnDiskCAS::open(AbsPath); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled"); +#endif /* LLVM_ENABLE_ONDISK_CAS */ +} + +std::unique_ptr +cas::builtin::createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr UniDB) { + return std::make_unique(std::move(UniDB)); +} + +void cas::getDefaultOnDiskCASPath(SmallVectorImpl &Path) { + // FIXME: Should this return 'Error' instead of hard-failing? 
+ if (!llvm::sys::path::cache_directory(Path)) + report_fatal_error("cannot get default cache directory"); + llvm::sys::path::append(Path, DefaultDir, DefaultName); +} + +std::string cas::getDefaultOnDiskCASPath() { + SmallString<128> Path; + getDefaultOnDiskCASPath(Path); + return Path.str().str(); +} diff --git a/llvm/lib/CAS/OnDiskCommon.cpp b/llvm/lib/CAS/OnDiskCommon.cpp new file mode 100644 index 00000000000000..718d8992379a8f --- /dev/null +++ b/llvm/lib/CAS/OnDiskCommon.cpp @@ -0,0 +1,26 @@ +//===- OnDiskCommon.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "OnDiskCommon.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" + +using namespace llvm; + +Expected> cas::ondisk::getOverriddenMaxMappingSize() { + constexpr const char *EnvVar = "LLVM_CAS_MAX_MAPPING_SIZE"; + const char *Value = getenv(EnvVar); + if (!Value) + return std::nullopt; + + uint64_t Size; + if (StringRef(Value).getAsInteger(/*auto*/ 0, Size)) + return createStringError(inconvertibleErrorCode(), + "invalid value for %s: expected integer", EnvVar); + return Size; +} diff --git a/llvm/lib/CAS/OnDiskCommon.h b/llvm/lib/CAS/OnDiskCommon.h new file mode 100644 index 00000000000000..7394e45dc4e3de --- /dev/null +++ b/llvm/lib/CAS/OnDiskCommon.h @@ -0,0 +1,24 @@ +//===- OnDiskCommon.h -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CAS_ONDISKCOMMON_H +#define LLVM_LIB_CAS_ONDISKCOMMON_H + +#include "llvm/Support/Error.h" +#include + +namespace llvm::cas::ondisk { + +/// Retrieves an overridden maximum mapping size for CAS files, if any, by +/// checking LLVM_CAS_MAX_MAPPING_SIZE in the environment. If the value is +/// unreadable, returns an error. +Expected> getOverriddenMaxMappingSize(); + +} // namespace llvm::cas::ondisk + +#endif // LLVM_LIB_CAS_ONDISKCOMMON_H diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp new file mode 100644 index 00000000000000..4486bdb2863661 --- /dev/null +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -0,0 +1,1508 @@ +//===- OnDiskGraphDB.cpp ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// On-disk CAS nodes database, independent of a particular hashing algorithm. +// +// Here's a top-level description of the current layout (could expose or make +// this configurable in the future). +// +// Files, each with a prefix set by \a FilePrefix: +// +// - db/.index: a file for the "index" table, named by \a +// IndexTableName and managed by \a HashMappedTrie. The contents are 8B +// that are accessed atomically, describing the object kind and where/how +// it's stored (including an optional file offset). See \a TrieRecord for +// more details. +// - db/.data: a file for the "data" table, named by \a +// DataPoolTableName and managed by \a DataStore. New objects within +// TrieRecord::MaxEmbeddedSize are inserted here as \a +// TrieRecord::StorageKind::DataPool. 
+// - db/..data: a file storing an object outside the main +// "data" table, named by its offset into the "index" table, with the +// format of \a TrieRecord::StorageKind::Standalone. +// - db/..leaf: a file storing a leaf node outside the +// main "data" table, named by its offset into the "index" table, with +// the format of \a TrieRecord::StorageKind::StandaloneLeaf. +// - db/..leaf+0: a file storing a leaf object outside the +// main "data" table, named by its offset into the "index" table, with +// the format of \a TrieRecord::StorageKind::StandaloneLeaf0. +// +// The "index", and "data" tables could be stored in a single file, +// (using a root record that points at the two types of stores), but splitting +// the files seems more convenient for now. +// +// ObjectID: this is a pointer to Trie record +// +// ObjectHandle: this is a pointer to Data record +// +// Eventually: consider creating a StringPool for strings instead of using +// RecordDataStore table. +// - Lookup by prefix tree +// - Store by suffix tree +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/OnDiskGraphDB.h" +#include "OnDiskCommon.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" + +#define DEBUG_TYPE "on-disk-cas" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +static constexpr StringLiteral IndexTableName = "llvm.cas.index"; +static constexpr StringLiteral DataPoolTableName = "llvm.cas.data"; + +static constexpr StringLiteral IndexFile = "index"; +static constexpr StringLiteral DataPoolFile = "data"; + +static constexpr StringLiteral FilePrefix = "v8."; +static constexpr StringLiteral FileSuffixData = ".data"; +static constexpr StringLiteral FileSuffixLeaf = ".leaf"; +static constexpr 
/// Trie record data: 8B, atomic
/// - 1-byte: StorageKind
/// - 7-bytes: DataStoreOffset (offset into referenced file)
///
/// Both fields live in a single 64-bit word: the storage kind occupies the top
/// byte and the offset the low 56 bits (see \a pack() and \a unpack()). An
/// all-zero word decodes to \a StorageKind::Unknown with a default offset.
class TrieRecord {
public:
  enum class StorageKind : uint8_t {
    /// Unknown object.
    Unknown = 0,

    /// vX.data: main pool, full DataStore record.
    DataPool = 1,

    /// vX.{index offset}.data: standalone, with a full DataStore record.
    Standalone = 10,

    /// vX.{index offset}.leaf: standalone, just the data. File contents
    /// exactly the data content and file size matches the data size. No refs.
    StandaloneLeaf = 11,

    /// vX.{index offset}.leaf+0: standalone, just the data plus an
    /// extra null character ('\0'). File size is 1 bigger than the data size.
    /// No refs.
    StandaloneLeaf0 = 12,
  };

  /// Returns the file-name suffix used for the standalone storage kind \p SK.
  /// Any kind other than the three standalone variants is a programming error
  /// and reaches llvm_unreachable.
  static StringRef getStandaloneFileSuffix(StorageKind SK) {
    switch (SK) {
    default:
      llvm_unreachable("Expected standalone storage kind");
    case TrieRecord::StorageKind::Standalone:
      return FileSuffixData;
    case TrieRecord::StorageKind::StandaloneLeaf0:
      return FileSuffixLeaf0;
    case TrieRecord::StorageKind::StandaloneLeaf:
      return FileSuffixLeaf;
    }
  }

  enum Limits : int64_t {
    // Largest size (64KB - 1) that is still embedded in the data pool; anything
    // larger is saved standalone instead of being embedded.
    MaxEmbeddedSize = 64LL * 1024LL - 1,
  };

  /// Unpacked view of a record: the storage kind and the offset into the file
  /// that kind refers to.
  struct Data {
    StorageKind SK = StorageKind::Unknown;
    FileOffset Offset;
  };

  /// Encodes \p D into the 64-bit on-disk word. Asserts that the offset fits
  /// in 56 bits and that an Unknown record encodes to zero; in asserts builds
  /// the result is also decoded again to check that it round-trips.
  static uint64_t pack(Data D) {
    assert(D.Offset.get() < (int64_t)(1ULL << 56));
    uint64_t Packed = uint64_t(D.SK) << 56 | D.Offset.get();
    assert(D.SK != StorageKind::Unknown || Packed == 0);
#ifndef NDEBUG
    Data RoundTrip = unpack(Packed);
    assert(D.SK == RoundTrip.SK);
    assert(D.Offset.get() == RoundTrip.Offset.get());
#endif
    return Packed;
  }

  /// Decodes a 64-bit word produced by \a pack(). Zero yields a
  /// default-constructed \a Data (StorageKind::Unknown).
  static Data unpack(uint64_t Packed) {
    Data D;
    if (!Packed)
      return D;
    // Top byte is the kind; the remaining 56 bits are the offset.
    D.SK = (StorageKind)(Packed >> 56);
    D.Offset = FileOffset(Packed & (UINT64_MAX >> 8));
    return D;
  }

  /// Starts out as the all-zero (Unknown) record.
  TrieRecord() : Storage(0) {}

  /// Atomically reads the record and returns it unpacked.
  Data load() const { return unpack(Storage); }

  /// Atomically replaces the record with \p New if it still equals
  /// \p Existing. On failure \p Existing is updated to the value found.
  bool compare_exchange_strong(Data &Existing, Data New);

private:
  // The packed word; every access goes through pack()/unpack().
  std::atomic Storage;
};
+ enum class RefKindFlags { + InternalRef = 0U, + InternalRef4B = 1U, + Max = InternalRef4B, + }; + + enum Counts : int { + NumRefsShift = 0, + NumRefsBits = 3, + DataSizeShift = NumRefsShift + NumRefsBits, + DataSizeBits = 2, + RefKindShift = DataSizeShift + DataSizeBits, + RefKindBits = 1, + }; + static_assert(((UINT32_MAX << NumRefsBits) & (uint32_t)NumRefsFlags::Max) == + 0, + "Not enough bits"); + static_assert(((UINT32_MAX << DataSizeBits) & (uint32_t)DataSizeFlags::Max) == + 0, + "Not enough bits"); + static_assert(((UINT32_MAX << RefKindBits) & (uint32_t)RefKindFlags::Max) == + 0, + "Not enough bits"); + + struct LayoutFlags { + NumRefsFlags NumRefs; + DataSizeFlags DataSize; + RefKindFlags RefKind; + + static uint64_t pack(LayoutFlags LF) { + unsigned Packed = ((unsigned)LF.NumRefs << NumRefsShift) | + ((unsigned)LF.DataSize << DataSizeShift) | + ((unsigned)LF.RefKind << RefKindShift); +#ifndef NDEBUG + LayoutFlags RoundTrip = unpack(Packed); + assert(LF.NumRefs == RoundTrip.NumRefs); + assert(LF.DataSize == RoundTrip.DataSize); + assert(LF.RefKind == RoundTrip.RefKind); +#endif + return Packed; + } + static LayoutFlags unpack(uint64_t Storage) { + assert(Storage <= UINT8_MAX && "Expect storage to fit in a byte"); + LayoutFlags LF; + LF.NumRefs = + (NumRefsFlags)((Storage >> NumRefsShift) & ((1U << NumRefsBits) - 1)); + LF.DataSize = (DataSizeFlags)((Storage >> DataSizeShift) & + ((1U << DataSizeBits) - 1)); + LF.RefKind = + (RefKindFlags)((Storage >> RefKindShift) & ((1U << RefKindBits) - 1)); + return LF; + } + }; + + /// Header layout: + /// - 1-byte: LayoutFlags + /// - 1-byte: 1B size field + /// - {0,2}-bytes: 2B size field + struct Header { + using PackTy = uint32_t; + PackTy Packed; + + static constexpr unsigned LayoutFlagsShift = + (sizeof(PackTy) - 1) * CHAR_BIT; + }; + + struct Input { + InternalRefArrayRef Refs; + ArrayRef Data; + }; + + LayoutFlags getLayoutFlags() const { + return LayoutFlags::unpack(H->Packed >> Header::LayoutFlagsShift); + 
/// An object stored in its own file and loaded into memory, together with the
/// standalone storage kind that says how the buffer contents are laid out.
class StandaloneDataInMemory {
public:
  /// Interprets the buffer according to \a SK and returns the content.
  OnDiskContent getContent() const;

  /// FIXME: Should be mapped_file_region instead of MemoryBuffer to drop a
  /// layer of indirection.
  std::unique_ptr Region;

  /// Storage kind of the file backing \a Region; always one of the standalone
  /// kinds (checked by the constructor in asserts builds).
  TrieRecord::StorageKind SK;

  /// Takes ownership of \p Region. In asserts builds, checks that \p SK is
  /// Standalone, StandaloneLeaf or StandaloneLeaf0.
  StandaloneDataInMemory(std::unique_ptr Region,
                         TrieRecord::StorageKind SK)
      : Region(std::move(Region)), SK(SK) {
#ifndef NDEBUG
    bool IsStandalone = false;
    switch (SK) {
    case TrieRecord::StorageKind::Standalone:
    case TrieRecord::StorageKind::StandaloneLeaf:
    case TrieRecord::StorageKind::StandaloneLeaf0:
      IsStandalone = true;
      break;
    default:
      break;
    }
    assert(IsStandalone);
#endif
  }
};
/// Accumulates refs for a new record, keeping them in the compact 4B form for
/// as long as every ref pushed so far can be shrunk. The first ref that cannot
/// be shrunk switches the whole vector to full-size refs.
class InternalRefVector {
public:
  /// Appends \p Ref, switching to full-size storage if \p Ref does not fit in
  /// the 4B form.
  void push_back(InternalRef Ref) {
    // Already using full-size refs: nothing to decide.
    if (NeedsFull)
      return FullRefs.push_back(Ref);
    // Still compact, and this ref fits as well.
    if (std::optional Small = InternalRef4B::tryToShrink(Ref))
      return SmallRefs.push_back(*Small);
    // First ref that does not fit: move everything collected so far into the
    // full-size vector, then append the new ref.
    NeedsFull = true;
    assert(FullRefs.empty());
    FullRefs.reserve(SmallRefs.size() + 1);
    for (InternalRef4B Small : SmallRefs)
      FullRefs.push_back(Small);
    FullRefs.push_back(Ref);
    SmallRefs.clear();
  }

  /// Views whichever of the two vectors is in use. At most one of them is
  /// non-empty at any time.
  operator InternalRefArrayRef() const {
    assert(SmallRefs.empty() || FullRefs.empty());
    return NeedsFull ? InternalRefArrayRef(FullRefs)
                     : InternalRefArrayRef(SmallRefs);
  }

private:
  // True once a ref that cannot be shrunk has been pushed.
  bool NeedsFull = false;
  // Refs in the 4B form; used while NeedsFull is false.
  SmallVector SmallRefs;
  // Refs in the full-size form; used once NeedsFull is true.
  SmallVector FullRefs;
};
/// Returns the standalone data previously registered for \p Hash via
/// \a insert(), or nullptr if there is none. The shard's mutex is held for the
/// duration of the lookup.
///
/// NOTE(review): the map is probed with Hash.data(), i.e. the key appears to
/// be the address of the hash bytes rather than their contents; this presumably
/// relies on callers always passing the same stable storage for a given hash
/// -- confirm against the callers.
template
const StandaloneDataInMemory *
StandaloneDataMap::lookup(ArrayRef Hash) const {
  auto &S = getShard(Hash);
  std::lock_guard Lock(S.Mutex);
  auto I = S.Map.find(Hash.data());
  if (I == S.Map.end())
    return nullptr;
  // The map owns the value; hand out a plain pointer to it.
  return &*I->second;
}
+ std::string TmpName; + + // The open file descriptor. + int FD = -1; + + // Keep this with the given name. + Error keep(const Twine &Name); + Error discard(); + + // This checks that keep or delete was called. + ~TempFile() { consumeError(discard()); } +}; + +class OnDiskGraphDB::MappedTempFile { +public: + char *data() const { return Map.data(); } + size_t size() const { return Map.size(); } + + Error discard() { + assert(Map && "Map already destroyed"); + Map.unmap(); + return Temp.discard(); + } + + Error keep(const Twine &Name) { + assert(Map && "Map already destroyed"); + Map.unmap(); + return Temp.keep(Name); + } + + MappedTempFile(TempFile Temp, sys::fs::mapped_file_region Map) + : Temp(std::move(Temp)), Map(std::move(Map)) {} + +private: + TempFile Temp; + sys::fs::mapped_file_region Map; +}; + +Error OnDiskGraphDB::TempFile::discard() { + Done = true; + if (FD != -1) { + sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD); + if (std::error_code EC = sys::fs::closeFile(File)) + return errorCodeToError(EC); + } + FD = -1; + + // Always try to close and remove. + std::error_code RemoveEC; + if (!TmpName.empty()) + if (std::error_code EC = sys::fs::remove(TmpName)) + return errorCodeToError(EC); + TmpName = ""; + + return Error::success(); +} + +Error OnDiskGraphDB::TempFile::keep(const Twine &Name) { + assert(!Done); + Done = true; + // Always try to close and rename. 
+ std::error_code RenameEC = sys::fs::rename(TmpName, Name); + + if (!RenameEC) + TmpName = ""; + + sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD); + if (std::error_code EC = sys::fs::closeFile(File)) + return errorCodeToError(EC); + FD = -1; + + return errorCodeToError(RenameEC); +} + +Expected +OnDiskGraphDB::TempFile::create(const Twine &Model) { + int FD; + SmallString<128> ResultPath; + if (std::error_code EC = sys::fs::createUniqueFile(Model, FD, ResultPath)) + return errorCodeToError(EC); + + TempFile Ret(ResultPath, FD); + return std::move(Ret); +} + +bool TrieRecord::compare_exchange_strong(Data &Existing, Data New) { + uint64_t ExistingPacked = pack(Existing); + uint64_t NewPacked = pack(New); + if (Storage.compare_exchange_strong(ExistingPacked, NewPacked)) + return true; + Existing = unpack(ExistingPacked); + return false; +} + +DataRecordHandle DataRecordHandle::construct(char *Mem, const Input &I) { + return constructImpl(Mem, I, Layout(I)); +} + +DataRecordHandle DataRecordHandle::constructImpl(char *Mem, const Input &I, + const Layout &L) { + char *Next = Mem + sizeof(Header); + + // Fill in Packed and set other data, then come back to construct the header. + Header::PackTy Packed = 0; + Packed |= LayoutFlags::pack(L.Flags) << Header::LayoutFlagsShift; + + // Construct DataSize. + switch (L.Flags.DataSize) { + case DataSizeFlags::Uses1B: + assert(I.Data.size() <= UINT8_MAX); + Packed |= (Header::PackTy)I.Data.size() + << ((sizeof(Packed) - 2) * CHAR_BIT); + break; + case DataSizeFlags::Uses2B: + assert(I.Data.size() <= UINT16_MAX); + Packed |= (Header::PackTy)I.Data.size() + << ((sizeof(Packed) - 4) * CHAR_BIT); + break; + case DataSizeFlags::Uses4B: + support::endian::write32le(Next, I.Data.size()); + Next += 4; + break; + case DataSizeFlags::Uses8B: + support::endian::write64le(Next, I.Data.size()); + Next += 8; + break; + } + + // Construct NumRefs. 
+ // + // NOTE: May be writing NumRefs even if there are zero refs in order to fix + // alignment. + switch (L.Flags.NumRefs) { + case NumRefsFlags::Uses0B: + break; + case NumRefsFlags::Uses1B: + assert(I.Refs.size() <= UINT8_MAX); + Packed |= (Header::PackTy)I.Refs.size() + << ((sizeof(Packed) - 2) * CHAR_BIT); + break; + case NumRefsFlags::Uses2B: + assert(I.Refs.size() <= UINT16_MAX); + Packed |= (Header::PackTy)I.Refs.size() + << ((sizeof(Packed) - 4) * CHAR_BIT); + break; + case NumRefsFlags::Uses4B: + support::endian::write32le(Next, I.Refs.size()); + Next += 4; + break; + case NumRefsFlags::Uses8B: + support::endian::write64le(Next, I.Refs.size()); + Next += 8; + break; + } + + // Construct Refs[]. + if (!I.Refs.empty()) { + assert((L.Flags.RefKind == RefKindFlags::InternalRef4B) == I.Refs.is4B()); + ArrayRef RefsBuffer = I.Refs.getBuffer(); + llvm::copy(RefsBuffer, Next); + Next += RefsBuffer.size(); + } + + // Construct Data and the trailing null. + assert(isAddrAligned(Align(8), Next)); + llvm::copy(I.Data, Next); + Next[I.Data.size()] = 0; + + // Construct the header itself and return. + Header *H = new (Mem) Header{Packed}; + DataRecordHandle Record(*H); + assert(Record.getData() == I.Data); + assert(Record.getNumRefs() == I.Refs.size()); + assert(Record.getRefs() == I.Refs); + assert(Record.getLayoutFlags().DataSize == L.Flags.DataSize); + assert(Record.getLayoutFlags().NumRefs == L.Flags.NumRefs); + assert(Record.getLayoutFlags().RefKind == L.Flags.RefKind); + return Record; +} + +DataRecordHandle::Layout::Layout(const Input &I) { + // Start initial relative offsets right after the Header. + uint64_t RelOffset = sizeof(Header); + + // Initialize the easy stuff. + DataSize = I.Data.size(); + NumRefs = I.Refs.size(); + + // Check refs size. + Flags.RefKind = + I.Refs.is4B() ? RefKindFlags::InternalRef4B : RefKindFlags::InternalRef; + + // Find the smallest slot available for DataSize. 
+ bool Has1B = true; + bool Has2B = true; + if (DataSize <= UINT8_MAX && Has1B) { + Flags.DataSize = DataSizeFlags::Uses1B; + Has1B = false; + } else if (DataSize <= UINT16_MAX && Has2B) { + Flags.DataSize = DataSizeFlags::Uses2B; + Has2B = false; + } else if (DataSize <= UINT32_MAX) { + Flags.DataSize = DataSizeFlags::Uses4B; + RelOffset += 4; + } else { + Flags.DataSize = DataSizeFlags::Uses8B; + RelOffset += 8; + } + + // Find the smallest slot available for NumRefs. Never sets NumRefs8B here. + if (!NumRefs) { + Flags.NumRefs = NumRefsFlags::Uses0B; + } else if (NumRefs <= UINT8_MAX && Has1B) { + Flags.NumRefs = NumRefsFlags::Uses1B; + Has1B = false; + } else if (NumRefs <= UINT16_MAX && Has2B) { + Flags.NumRefs = NumRefsFlags::Uses2B; + Has2B = false; + } else { + Flags.NumRefs = NumRefsFlags::Uses4B; + RelOffset += 4; + } + + // Helper to "upgrade" either DataSize or NumRefs by 4B to avoid complicated + // padding rules when reading and writing. This also bumps RelOffset. + // + // The value for NumRefs is strictly limited to UINT32_MAX, but it can be + // stored as 8B. This means we can *always* find a size to grow. + // + // NOTE: Only call this once. + auto GrowSizeFieldsBy4B = [&]() { + assert(isAligned(Align(4), RelOffset)); + RelOffset += 4; + + assert(Flags.NumRefs != NumRefsFlags::Uses8B && + "Expected to be able to grow NumRefs8B"); + + // First try to grow DataSize. NumRefs will not (yet) be 8B, and if + // DataSize is upgraded to 8B it'll already be aligned. + // + // Failing that, grow NumRefs. + if (Flags.DataSize < DataSizeFlags::Uses4B) + Flags.DataSize = DataSizeFlags::Uses4B; // DataSize: Packed => 4B. + else if (Flags.DataSize < DataSizeFlags::Uses8B) + Flags.DataSize = DataSizeFlags::Uses8B; // DataSize: 4B => 8B. + else if (Flags.NumRefs < NumRefsFlags::Uses4B) + Flags.NumRefs = NumRefsFlags::Uses4B; // NumRefs: Packed => 4B. + else + Flags.NumRefs = NumRefsFlags::Uses8B; // NumRefs: 4B => 8B. 
+ }; + + assert(isAligned(Align(4), RelOffset)); + if (Flags.RefKind == RefKindFlags::InternalRef) { + // List of 8B refs should be 8B-aligned. Grow one of the sizes to get this + // without padding. + if (!isAligned(Align(8), RelOffset)) + GrowSizeFieldsBy4B(); + + assert(isAligned(Align(8), RelOffset)); + RefsRelOffset = RelOffset; + RelOffset += 8 * NumRefs; + } else { + // The array of 4B refs doesn't need 8B alignment, but the data will need + // to be 8B-aligned. Detect this now, and, if necessary, shift everything + // by 4B by growing one of the sizes. + // If we remove the need for 8B-alignment for data there is <1% savings in + // disk storage for a clang build using MCCAS but the 8B-alignment may be + // useful in the future so keep it for now. + uint64_t RefListSize = 4 * NumRefs; + if (!isAligned(Align(8), RelOffset + RefListSize)) + GrowSizeFieldsBy4B(); + RefsRelOffset = RelOffset; + RelOffset += RefListSize; + } + + assert(isAligned(Align(8), RelOffset)); + DataRelOffset = RelOffset; +} + +uint64_t DataRecordHandle::getDataSize() const { + int64_t RelOffset = sizeof(Header); + auto *DataSizePtr = reinterpret_cast(H) + RelOffset; + switch (getLayoutFlags().DataSize) { + case DataSizeFlags::Uses1B: + return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX; + case DataSizeFlags::Uses2B: + return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) & + UINT16_MAX; + case DataSizeFlags::Uses4B: + return support::endian::read32le(DataSizePtr); + case DataSizeFlags::Uses8B: + return support::endian::read64le(DataSizePtr); + } +} + +void DataRecordHandle::skipDataSize(LayoutFlags LF, int64_t &RelOffset) const { + if (LF.DataSize >= DataSizeFlags::Uses4B) + RelOffset += 4; + if (LF.DataSize >= DataSizeFlags::Uses8B) + RelOffset += 4; +} + +uint32_t DataRecordHandle::getNumRefs() const { + LayoutFlags LF = getLayoutFlags(); + int64_t RelOffset = sizeof(Header); + skipDataSize(LF, RelOffset); + auto *NumRefsPtr = reinterpret_cast(H) + 
RelOffset; + switch (LF.NumRefs) { + case NumRefsFlags::Uses0B: + return 0; + case NumRefsFlags::Uses1B: + return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX; + case NumRefsFlags::Uses2B: + return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) & + UINT16_MAX; + case NumRefsFlags::Uses4B: + return support::endian::read32le(NumRefsPtr); + case NumRefsFlags::Uses8B: + return support::endian::read64le(NumRefsPtr); + } +} + +void DataRecordHandle::skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const { + if (LF.NumRefs >= NumRefsFlags::Uses4B) + RelOffset += 4; + if (LF.NumRefs >= NumRefsFlags::Uses8B) + RelOffset += 4; +} + +int64_t DataRecordHandle::getRefsRelOffset() const { + LayoutFlags LF = getLayoutFlags(); + int64_t RelOffset = sizeof(Header); + skipDataSize(LF, RelOffset); + skipNumRefs(LF, RelOffset); + return RelOffset; +} + +int64_t DataRecordHandle::getDataRelOffset() const { + LayoutFlags LF = getLayoutFlags(); + int64_t RelOffset = sizeof(Header); + skipDataSize(LF, RelOffset); + skipNumRefs(LF, RelOffset); + uint32_t RefSize = LF.RefKind == RefKindFlags::InternalRef4B ? 
/// Returns the index entry for \p Hash, going through Index.insertLazy() so
/// that an entry exists afterwards (the result is asserted to be non-null).
///
/// The callback passed to insertLazy() placement-constructs a fresh
/// \a TrieRecord (i.e. the Unknown state) in the entry's value storage.
/// NOTE(review): presumably insertLazy() only invokes the callback when the
/// hash is not in the index yet -- confirm against OnDiskHashMappedTrie.
OnDiskGraphDB::IndexProxy OnDiskGraphDB::indexHash(ArrayRef Hash) {
  OnDiskHashMappedTrie::pointer P = Index.insertLazy(
      Hash, [](FileOffset TentativeOffset,
               OnDiskHashMappedTrie::ValueProxy TentativeValue) {
        // The value slot must be exactly one suitably aligned TrieRecord.
        assert(TentativeValue.Data.size() == sizeof(TrieRecord));
        assert(
            isAddrAligned(Align::Of(), TentativeValue.Data.data()));
        new (TentativeValue.Data.data()) TrieRecord();
      });
  assert(P && "Expected insertion");
  return getIndexProxyFromPointer(P);
}
+OnDiskGraphDB::IndexProxy OnDiskGraphDB::getIndexProxyFromPointer( + OnDiskHashMappedTrie::const_pointer P) const { + assert(P); + assert(P.getOffset()); + return IndexProxy{P.getOffset(), P->Hash, + *const_cast( + reinterpret_cast(P->Data.data()))}; +} + +ObjectID OnDiskGraphDB::getReference(ArrayRef Hash) { + IndexProxy I = indexHash(Hash); + return getExternalReference(I); +} + +ObjectID OnDiskGraphDB::getExternalReference(const IndexProxy &I) { + return getExternalReference(makeInternalRef(I.Offset)); +} + +std::optional +OnDiskGraphDB::getExistingReference(ArrayRef Digest) { + auto tryUpstream = + [&](std::optional I) -> std::optional { + if (!UpstreamDB) + return std::nullopt; + std::optional UpstreamID = + UpstreamDB->getExistingReference(Digest); + if (!UpstreamID) + return std::nullopt; + if (!I) + I.emplace(indexHash(Digest)); + return getExternalReference(*I); + }; + + OnDiskHashMappedTrie::const_pointer P = Index.find(Digest); + if (!P) + return tryUpstream(std::nullopt); + IndexProxy I = getIndexProxyFromPointer(P); + TrieRecord::Data Obj = I.Ref.load(); + if (Obj.SK == TrieRecord::StorageKind::Unknown) + return tryUpstream(I); + return getExternalReference(makeInternalRef(I.Offset)); +} + +OnDiskGraphDB::IndexProxy +OnDiskGraphDB::getIndexProxyFromRef(InternalRef Ref) const { + OnDiskHashMappedTrie::const_pointer P = + Index.recoverFromFileOffset(Ref.getFileOffset()); + if (LLVM_UNLIKELY(!P)) + report_fatal_error("OnDiskCAS: corrupt internal reference"); + return getIndexProxyFromPointer(P); +} + +ArrayRef OnDiskGraphDB::getDigest(InternalRef Ref) const { + IndexProxy I = getIndexProxyFromRef(Ref); + return I.Hash; +} + +ArrayRef OnDiskGraphDB::getDigest(const IndexProxy &I) const { + return I.Hash; +} + +ArrayRef OnDiskGraphDB::getObjectData(ObjectHandle Node) const { + OnDiskContent Content = getContentFromHandle(Node); + if (Content.Bytes) + return *Content.Bytes; + assert(Content.Record && "Expected record or bytes"); + return 
Content.Record->getData(); +} + +InternalRefArrayRef OnDiskGraphDB::getInternalRefs(ObjectHandle Node) const { + if (std::optional Record = + getContentFromHandle(Node).Record) + return Record->getRefs(); + return std::nullopt; +} + +Expected> +OnDiskGraphDB::load(ObjectID ExternalRef) { + InternalRef Ref = getInternalRef(ExternalRef); + IndexProxy I = getIndexProxyFromRef(Ref); + TrieRecord::Data Object = I.Ref.load(); + + if (Object.SK == TrieRecord::StorageKind::Unknown) { + if (!UpstreamDB) + return std::nullopt; + return faultInFromUpstream(ExternalRef); + } + + auto toObjectHandle = [](InternalHandle H) -> ObjectHandle { + return ObjectHandle::fromOpaqueData(H.getRawData()); + }; + + if (Object.SK == TrieRecord::StorageKind::DataPool) + return toObjectHandle(InternalHandle(Object.Offset)); + + // Only TrieRecord::StorageKind::Standalone (and variants) need to be + // explicitly loaded. + // + // There's corruption if standalone objects have offsets, or if we get here + // for something that isn't standalone. + if (Object.Offset) + return createCorruptObjectError(getDigest(I)); + switch (Object.SK) { + case TrieRecord::StorageKind::Unknown: + case TrieRecord::StorageKind::DataPool: + llvm_unreachable("unexpected storage kind"); + case TrieRecord::StorageKind::Standalone: + case TrieRecord::StorageKind::StandaloneLeaf0: + case TrieRecord::StorageKind::StandaloneLeaf: + break; + } + + // Load it from disk. + // + // Note: Creation logic guarantees that data that needs null-termination is + // suitably 0-padded. Requiring null-termination here would be too expensive + // for extremely large objects that happen to be page-aligned. 
+ SmallString<256> Path; + getStandalonePath(TrieRecord::getStandaloneFileSuffix(Object.SK), I, Path); + ErrorOr> OwnedBuffer = MemoryBuffer::getFile( + Path, /*IsText=*/false, /*RequiresNullTerminator=*/false); + if (!OwnedBuffer) + return createCorruptObjectError(getDigest(I)); + + return toObjectHandle( + InternalHandle(static_cast(StandaloneData) + ->insert(I.Hash, Object.SK, std::move(*OwnedBuffer)))); +} + +bool OnDiskGraphDB::containsObject(ObjectID ExternalRef, + bool CheckUpstream) const { + InternalRef Ref = getInternalRef(ExternalRef); + IndexProxy I = getIndexProxyFromRef(Ref); + TrieRecord::Data Object = I.Ref.load(); + if (Object.SK != TrieRecord::StorageKind::Unknown) + return true; + if (!CheckUpstream || !UpstreamDB) + return false; + std::optional UpstreamID = + UpstreamDB->getExistingReference(getDigest(I)); + return UpstreamID.has_value(); +} + +InternalRef OnDiskGraphDB::makeInternalRef(FileOffset IndexOffset) { + return InternalRef::getFromOffset(IndexOffset); +} + +void OnDiskGraphDB::getStandalonePath(StringRef Suffix, const IndexProxy &I, + SmallVectorImpl &Path) const { + Path.assign(RootPath.begin(), RootPath.end()); + sys::path::append(Path, FilePrefix + Twine(I.Offset.get()) + Suffix); +} + +OnDiskContent OnDiskGraphDB::getContentFromHandle(ObjectHandle OH) const { + auto getInternalHandle = [](ObjectHandle Handle) -> InternalHandle { + uint64_t Data = Handle.getOpaqueData(); + if (Data & 1) + return InternalHandle(*reinterpret_cast( + Data & (-1ULL << 1))); + return InternalHandle(Data); + }; + + InternalHandle Handle = getInternalHandle(OH); + if (Handle.SDIM) + return Handle.SDIM->getContent(); + + auto DataHandle = + DataRecordHandle::get(DataPool.beginData(Handle.getAsFileOffset())); + assert(DataHandle.getData().end()[0] == 0 && "Null termination"); + return OnDiskContent{DataHandle, std::nullopt}; +} + +OnDiskContent StandaloneDataInMemory::getContent() const { + bool Leaf0 = false; + bool Leaf = false; + switch (SK) { + default: 
+ llvm_unreachable("Storage kind must be standalone"); + case TrieRecord::StorageKind::Standalone: + break; + case TrieRecord::StorageKind::StandaloneLeaf0: + Leaf = Leaf0 = true; + break; + case TrieRecord::StorageKind::StandaloneLeaf: + Leaf = true; + break; + } + + if (Leaf) { + assert(Region->getBuffer().drop_back(Leaf0).end()[0] == 0 && + "Standalone node data missing null termination"); + return OnDiskContent{ + std::nullopt, + arrayRefFromStringRef(Region->getBuffer().drop_back(Leaf0))}; + } + + DataRecordHandle Record = DataRecordHandle::get(Region->getBuffer().data()); + assert(Record.getData().end()[0] == 0 && + "Standalone object record missing null termination for data"); + return OnDiskContent{Record, std::nullopt}; +} + +Expected +OnDiskGraphDB::createTempFile(StringRef FinalPath, uint64_t Size) { + assert(Size && "Unexpected request for an empty temp file"); + Expected File = TempFile::create(FinalPath + ".%%%%%%"); + if (!File) + return File.takeError(); + + if (auto EC = sys::fs::resize_file_before_mapping_readwrite(File->FD, Size)) + return createFileError(File->TmpName, EC); + + std::error_code EC; + sys::fs::mapped_file_region Map(sys::fs::convertFDToNativeFile(File->FD), + sys::fs::mapped_file_region::readwrite, Size, + 0, EC); + if (EC) + return createFileError(File->TmpName, EC); + return MappedTempFile(std::move(*File), std::move(Map)); +} + +static size_t getPageSize() { + static int PageSize = sys::Process::getPageSizeEstimate(); + return PageSize; +} + +Error OnDiskGraphDB::createStandaloneLeaf(IndexProxy &I, ArrayRef Data) { + assert(Data.size() > TrieRecord::MaxEmbeddedSize && + "Expected a bigger file for external content..."); + + bool Leaf0 = isAligned(Align(getPageSize()), Data.size()); + TrieRecord::StorageKind SK = Leaf0 ? 
/// Stores an object with references \p Refs and payload \p Data under \p ID.
///
/// Returns success without writing anything if the index already has a known
/// storage kind for \p ID. Otherwise:
///  - a payload larger than TrieRecord::MaxEmbeddedSize with no refs is
///    written as a standalone leaf file;
///  - any other object is serialized as a DataRecordHandle, into the data pool
///    when the whole record fits in MaxEmbeddedSize, or into its own
///    standalone file when it does not.
/// The index record is then published with a compare-exchange. Losing that
/// race to another writer is not an error as long as the winner left a known
/// storage kind behind.
///
/// \returns an error if allocating/creating the storage fails, if keeping the
/// temp file fails, or if the index record is still Unknown after a failed
/// exchange (reported as a corrupt object).
Error OnDiskGraphDB::store(ObjectID ID, ArrayRef Refs,
                           ArrayRef Data) {
  IndexProxy I = getIndexProxyFromRef(getInternalRef(ID));

  // Early return in case the node exists.
  {
    TrieRecord::Data Existing = I.Ref.load();
    if (Existing.SK != TrieRecord::StorageKind::Unknown)
      return Error::success();
  }

  // Big leaf nodes.
  if (Refs.empty() && Data.size() > TrieRecord::MaxEmbeddedSize)
    return createStandaloneLeaf(I, Data);

  // TODO: Check whether it's worth checking the index for an already existing
  // object (like storeTreeImpl() does) before building up the
  // InternalRefVector.
  InternalRefVector InternalRefs;
  for (ObjectID Ref : Refs)
    InternalRefs.push_back(getInternalRef(Ref));

  // Create the object.

  DataRecordHandle::Input Input{InternalRefs, Data};

  // Compute the storage kind, allocate it, and create the record.
  //
  // The allocator callback below records its decision in these locals: SK and
  // PoolOffset for a pooled record, or SK, Path, File and FileSize for a
  // standalone one.
  TrieRecord::StorageKind SK = TrieRecord::StorageKind::Unknown;
  FileOffset PoolOffset;
  SmallString<256> Path;
  std::optional File;
  std::optional FileSize;
  auto Alloc = [&](size_t Size) -> Expected {
    // Size is the total size of the serialized record, not just the payload.
    if (Size <= TrieRecord::MaxEmbeddedSize) {
      SK = TrieRecord::StorageKind::DataPool;
      OnDiskDataAllocator::pointer P = DataPool.allocate(Size);
      PoolOffset = P.getOffset();
      LLVM_DEBUG({
        dbgs() << "pool-alloc addr=" << (void *)PoolOffset.get()
               << " size=" << Size
               << " end=" << (void *)(PoolOffset.get() + Size) << "\n";
      });
      return P->data();
    }

    // Too big for the pool: build the record in a mapped temp file next to
    // its final standalone path.
    SK = TrieRecord::StorageKind::Standalone;
    getStandalonePath(TrieRecord::getStandaloneFileSuffix(SK), I, Path);
    if (Error E = createTempFile(Path, Size).moveInto(File))
      return std::move(E);
    assert(File->size() == Size);
    FileSize = Size;
    return File->data();
  };
  DataRecordHandle Record;
  if (Error E =
          DataRecordHandle::createWithError(Alloc, Input).moveInto(Record))
    return E;
  assert(Record.getData().end()[0] == 0 && "Expected null-termination");
  assert(Record.getData() == Input.Data && "Expected initialization");
  assert(SK != TrieRecord::StorageKind::Unknown);
  assert(bool(File) != bool(PoolOffset) &&
         "Expected either a mapped file or a pooled offset");

  // Check for a race before calling MappedTempFile::keep().
  //
  // Then decide what to do with the file. Better to discard than overwrite if
  // another thread/process has already added this.
  TrieRecord::Data Existing = I.Ref.load();
  {
    TrieRecord::Data NewObject{SK, PoolOffset};
    if (File) {
      if (Existing.SK == TrieRecord::StorageKind::Unknown) {
        // Keep the file!
        if (Error E = File->keep(Path))
          return E;
      } else {
        // Someone else stored the object first; drop our temp file.
        File.reset();
      }
    }

    // If we didn't already see a racing/existing write, then try storing the
    // new object. If that races, confirm that the new value has valid storage.
    //
    // TODO: Find a way to reuse the storage from the new-but-abandoned record
    // handle.
    if (Existing.SK == TrieRecord::StorageKind::Unknown) {
      if (I.Ref.compare_exchange_strong(Existing, NewObject)) {
        // Only standalone files count towards the standalone storage size.
        if (FileSize)
          recordStandaloneSizeIncrease(*FileSize);
        return Error::success();
      }
    }
  }

  // The exchange failed (or was skipped), so Existing now holds what another
  // writer published; it must describe real storage.
  if (Existing.SK == TrieRecord::StorageKind::Unknown)
    return createCorruptObjectError(getDigest(I));

  // Load existing object.
  return Error::success();
}
/*MinFileSize=*/MB) + .moveInto(Index)) + return std::move(E); + + uint32_t UserHeaderSize = sizeof(std::atomic); + std::optional DataPool; + StringRef PolicyName = + Policy == FaultInPolicy::SingleNode ? "single" : "full"; + if (Error E = OnDiskDataAllocator::create( + AbsPath + Slash + FilePrefix + DataPoolFile, + DataPoolTableName + "[" + HashName + "]" + PolicyName, + MaxDataPoolSize, /*MinFileSize=*/MB, UserHeaderSize, + [](void *UserHeaderPtr) { + new (UserHeaderPtr) std::atomic(0); + }) + .moveInto(DataPool)) + return std::move(E); + if (DataPool->getUserHeader().size() != UserHeaderSize) + return createStringError(llvm::errc::argument_out_of_domain, + "unexpected user header in '" + AbsPath + Slash + + FilePrefix + DataPoolFile + "'"); + + return std::unique_ptr( + new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool), + std::move(UpstreamDB), Policy)); +} + +OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskHashMappedTrie Index, + OnDiskDataAllocator DataPool, + std::unique_ptr UpstreamDB, + FaultInPolicy Policy) + : Index(std::move(Index)), DataPool(std::move(DataPool)), + RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)), + FIPolicy(Policy) { + /// Lifetime for "big" objects not in DataPool. + /// + /// NOTE: Could use ThreadSafeHashMappedTrie here. For now, doing something + /// simpler on the assumption there won't be much contention since most data + /// is not big. If there is contention, and we've already fixed ObjectProxy + /// object handles to be cheap enough to use consistently, the fix might be + /// to use better use of them rather than optimizing this map. + /// + /// FIXME: Figure out the right number of shards, if any. + StandaloneData = new StandaloneDataMapTy(); +} + +OnDiskGraphDB::~OnDiskGraphDB() { + delete static_cast(StandaloneData); +} + +Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID, + ObjectHandle UpstreamNode) { + // Copies the full CAS tree from upstream. 
Uses depth-first copying to protect + // against the process dying during importing and leaving the database with an + // incomplete tree. Note that if the upstream has missing nodes then the tree + // will be copied with missing nodes as well, it won't be considered an error. + + struct UpstreamCursor { + ObjectHandle Node; + size_t RefsCount; + object_refs_iterator RefI; + object_refs_iterator RefE; + }; + /// Keeps track of the state of visitation for current node and all of its + /// parents. + SmallVector CursorStack; + /// Keeps track of the currently visited nodes as they are imported into + /// primary database, from current node and its parents. When a node is + /// entered for visitation it appends its own ID, then appends referenced IDs + /// as they get imported. When a node is fully imported it removes the + /// referenced IDs from the bottom of the stack which leaves its own ID at the + /// bottom, adding to the list of referenced IDs for the parent node. + SmallVector PrimaryNodesStack; + + auto enqueueNode = [&](ObjectID PrimaryID, std::optional Node) { + PrimaryNodesStack.push_back(PrimaryID); + if (!Node) + return; + auto Refs = UpstreamDB->getObjectRefs(*Node); + CursorStack.push_back({*Node, + (size_t)std::distance(Refs.begin(), Refs.end()), + Refs.begin(), Refs.end()}); + }; + + enqueueNode(PrimaryID, UpstreamNode); + + while (!CursorStack.empty()) { + UpstreamCursor &Cur = CursorStack.back(); + if (Cur.RefI == Cur.RefE) { + // Copy the node data into the primary store. + // FIXME: Use hard-link or cloning if the file-system supports it and data + // is stored into a separate file. + + // The bottom of \p PrimaryNodesStack contains the primary ID for the + // current node plus the list of imported referenced IDs. 
+ assert(PrimaryNodesStack.size() >= Cur.RefsCount + 1); + ObjectID PrimaryID = *(PrimaryNodesStack.end() - Cur.RefsCount - 1); + auto PrimaryRefs = ArrayRef(PrimaryNodesStack) + .slice(PrimaryNodesStack.size() - Cur.RefsCount); + auto Data = UpstreamDB->getObjectData(Cur.Node); + if (Error E = store(PrimaryID, PrimaryRefs, Data)) + return E; + // Remove the current node and its IDs from the stack. + PrimaryNodesStack.truncate(PrimaryNodesStack.size() - Cur.RefsCount); + CursorStack.pop_back(); + continue; + } + + ObjectID UpstreamID = *(Cur.RefI++); + ObjectID PrimaryID = getReference(UpstreamDB->getDigest(UpstreamID)); + if (containsObject(PrimaryID, /*CheckUpstream=*/false)) { + // This \p ObjectID already exists in the primary. Either it was imported + // via \p importFullTree or the client created it, in which case the + // client takes responsibility for how it was formed. + enqueueNode(PrimaryID, std::nullopt); + continue; + } + Expected> UpstreamNode = + UpstreamDB->load(UpstreamID); + if (!UpstreamNode) + return UpstreamNode.takeError(); + enqueueNode(PrimaryID, *UpstreamNode); + } + + assert(PrimaryNodesStack.size() == 1); + assert(PrimaryNodesStack.front() == PrimaryID); + return Error::success(); +} + +Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, + ObjectHandle UpstreamNode) { + // Copies only a single node, it doesn't copy the referenced nodes. + + // Copy the node data into the primary store. + // FIXME: Use hard-link or cloning if the file-system supports it and data is + // stored into a separate file. 
+ + auto Data = UpstreamDB->getObjectData(UpstreamNode); + auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode); + SmallVector Refs; + Refs.reserve(std::distance(UpstreamRefs.begin(), UpstreamRefs.end())); + for (ObjectID UpstreamRef : UpstreamRefs) + Refs.push_back(getReference(UpstreamDB->getDigest(UpstreamRef))); + + return store(PrimaryID, Refs, Data); +} + +Expected> +OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) { + assert(UpstreamDB); + + ObjectID UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID)); + Expected> UpstreamNode = + UpstreamDB->load(UpstreamID); + if (!UpstreamNode) + return UpstreamNode.takeError(); + if (!*UpstreamNode) + return std::nullopt; + + if (Error E = FIPolicy == FaultInPolicy::SingleNode + ? importSingleNode(PrimaryID, **UpstreamNode) + : importFullTree(PrimaryID, **UpstreamNode)) + return std::move(E); + return load(PrimaryID); +} diff --git a/llvm/lib/CAS/OnDiskHashMappedTrie.cpp b/llvm/lib/CAS/OnDiskHashMappedTrie.cpp new file mode 100644 index 00000000000000..09fef70ee7fc06 --- /dev/null +++ b/llvm/lib/CAS/OnDiskHashMappedTrie.cpp @@ -0,0 +1,1356 @@ +//===- OnDiskHashMappedTrie.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/OnDiskHashMappedTrie.h" +#include "HashMappedTrieIndexGenerator.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/CAS/MappedFileRegionBumpPtr.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::cas; + +#if LLVM_ENABLE_ONDISK_CAS + +static_assert(sizeof(size_t) == sizeof(uint64_t), "64-bit only"); +static_assert(sizeof(std::atomic) == sizeof(uint64_t), + "Requires lock-free 64-bit atomics"); + +//===----------------------------------------------------------------------===// +// Generic database data structures. +//===----------------------------------------------------------------------===// +namespace { +using MappedFileRegion = MappedFileRegionBumpPtr::RegionT; + +/// Generic handle for a table. +/// +/// Probably we want some table kinds for pointing at multiple tables. +/// - Probably a tree or trie type makes sense. +/// - Or a deque. Linear search is okay as long as there aren't many tables in +/// a file. +/// +/// Generic table header layout: +/// - 2-bytes: TableKind +/// - 2-bytes: TableNameSize +/// - 4-bytes: TableNameRelOffset (relative to header) +class TableHandle { +public: + enum class TableKind : uint16_t { + HashMappedTrie = 1, + DataAllocator = 2, + }; + struct Header { + TableKind Kind; + uint16_t NameSize; + int32_t NameRelOffset; // Relative to Header. 
+ }; + + explicit operator bool() const { return H; } + const Header &getHeader() const { return *H; } + MappedFileRegion &getRegion() const { return *Region; } + + template static void check() { + static_assert( + std::is_same::value, + "T::GenericHeader should be of type TableHandle::Header"); + static_assert(offsetof(typename T::Header, GenericHeader) == 0, + "T::GenericHeader must be the head of T::Header"); + } + template bool is() const { return T::Kind == H->Kind; } + template T dyn_cast() const { + check(); + if (is()) + return T(*Region, *reinterpret_cast(H)); + return T(); + } + template T cast() const { + assert(is()); + return dyn_cast(); + } + + StringRef getName() const { + auto *Begin = reinterpret_cast(H) + H->NameRelOffset; + return StringRef(Begin, H->NameSize); + } + + TableHandle() = default; + TableHandle(MappedFileRegion &Region, Header &H) : Region(&Region), H(&H) {} + TableHandle(MappedFileRegion &Region, intptr_t HeaderOffset) + : TableHandle(Region, + *reinterpret_cast
(Region.data() + HeaderOffset)) { + } + +private: + MappedFileRegion *Region = nullptr; + Header *H = nullptr; +}; + +/// Encapsulate a database file, which: +/// - Sets/checks magic. +/// - Sets/checks version. +/// - Points at an arbitrary root table (can be changed later using a lock-free +/// algorithm). +/// - Sets up a BumpPtr for allocation. +/// +/// Top-level layout: +/// - 8-bytes: Magic +/// - 8-bytes: Version +/// - 8-bytes: RootTable (16-bits: Kind; 48-bits: Offset) +/// - 8-bytes: BumpPtr +class DatabaseFile { +public: + static constexpr uint64_t getMagic() { return 0x00FFDA7ABA53FF00ULL; } + static constexpr uint64_t getVersion() { return 1ULL; } + struct Header { + uint64_t Magic; + uint64_t Version; + std::atomic RootTableOffset; + std::atomic BumpPtr; + }; + + const Header &getHeader() { return *H; } + MappedFileRegionBumpPtr &getAlloc() { return Alloc; } + MappedFileRegion &getRegion() { return Alloc.getRegion(); } + + /// Add a table. + /// + /// TODO: Allow lazy construction via getOrCreate()-style API. + void addTable(TableHandle Table); + + /// Find a table. May return null. + std::optional findTable(StringRef Name); + + static Expected + create(const Twine &Path, uint64_t Capacity, + function_ref NewDBConstructor); + + size_t size() const { return Alloc.size(); } + +private: + static Expected + get(std::shared_ptr Alloc) { + if (Error E = validate(Alloc->getRegion())) + return std::move(E); + return DatabaseFile(std::move(Alloc)); + } + + static Error validate(MappedFileRegion &Region); + + DatabaseFile(MappedFileRegionBumpPtr &Alloc) + : H(reinterpret_cast
(Alloc.data())), Alloc(Alloc) {} + DatabaseFile(std::shared_ptr Alloc) + : DatabaseFile(*Alloc) { + OwnedAlloc = std::move(Alloc); + } + + Header *H = nullptr; + MappedFileRegionBumpPtr &Alloc; + std::shared_ptr OwnedAlloc; +}; + +} // end anonymous namespace + +Expected +DatabaseFile::create(const Twine &Path, uint64_t Capacity, + function_ref NewDBConstructor) { + // Constructor for if the file doesn't exist. + auto NewFileConstructor = [&](MappedFileRegionBumpPtr &Alloc) -> Error { + assert(Alloc.capacity() >= sizeof(Header)); + (void)new (Alloc.data()) Header{getMagic(), getVersion(), {0}, {0}}; + Alloc.initializeBumpPtr(offsetof(Header, BumpPtr)); + DatabaseFile DB(Alloc); + return NewDBConstructor(DB); + }; + + // Get or create the file. + std::shared_ptr Alloc; + if (Error E = MappedFileRegionBumpPtr::createShared(Path, Capacity, + offsetof(Header, BumpPtr), + NewFileConstructor) + .moveInto(Alloc)) + return std::move(E); + + return DatabaseFile::get(std::move(Alloc)); +} + +void DatabaseFile::addTable(TableHandle Table) { + assert(Table); + assert(&Table.getRegion() == &getRegion()); + int64_t ExistingRootOffset = 0; + const int64_t NewOffset = + reinterpret_cast(&Table.getHeader()) - getRegion().data(); + if (H->RootTableOffset.compare_exchange_strong(ExistingRootOffset, NewOffset)) + return; + + // Silently ignore attempts to set the root to itself. + if (ExistingRootOffset == NewOffset) + return; + + // FIXME: Fix the API so that having the same name is not an error. Instead, + // the colliding table should just be used as-is and the client can decide + // what to do with the new one. + // + // TODO: Add support for creating a chain or tree of tables (more than one at + // all!) to avoid this error. 
+ TableHandle Root(getRegion(), ExistingRootOffset); + if (Root.getName() == Table.getName()) + report_fatal_error( + createStringError(make_error_code(std::errc::not_supported), + "table name collision '" + Table.getName() + "'")); + else + report_fatal_error( + createStringError(make_error_code(std::errc::not_supported), + "cannot add new table '" + Table.getName() + + "'" + " to existing root '" + + Root.getName() + "'")); +} + +std::optional DatabaseFile::findTable(StringRef Name) { + int64_t RootTableOffset = H->RootTableOffset.load(); + if (!RootTableOffset) + return std::nullopt; + + TableHandle Root(getRegion(), RootTableOffset); + if (Root.getName() == Name) + return Root; + + // TODO: Once multiple tables are supported, need to walk to find them. + return std::nullopt; +} + +Error DatabaseFile::validate(MappedFileRegion &Region) { + if (Region.size() < sizeof(Header)) + return createStringError(std::errc::invalid_argument, + "database: missing header"); + + // Check the magic and version. + auto *H = reinterpret_cast
(Region.data()); + if (H->Magic != getMagic()) + return createStringError(std::errc::invalid_argument, + "database: bad magic"); + if (H->Version != getVersion()) + return createStringError(std::errc::invalid_argument, + "database: wrong version"); + + // Check the bump-ptr, which should point past the header. + if (H->BumpPtr.load() < (int64_t)sizeof(Header)) + return createStringError(std::errc::invalid_argument, + "database: corrupt bump-ptr"); + + return Error::success(); +} + +//===----------------------------------------------------------------------===// +// HashMappedTrie data structures. +//===----------------------------------------------------------------------===// + +namespace { + +class SubtrieHandle; +class SubtrieSlotValue { +public: + explicit operator bool() const { return !isEmpty(); } + bool isEmpty() const { return !Offset; } + bool isData() const { return Offset > 0; } + bool isSubtrie() const { return Offset < 0; } + int64_t asData() const { + assert(isData()); + return Offset; + } + int64_t asSubtrie() const { + assert(isSubtrie()); + return -Offset; + } + + FileOffset asSubtrieFileOffset() const { return FileOffset(asSubtrie()); } + + FileOffset asDataFileOffset() const { return FileOffset(asData()); } + + int64_t getRawOffset() const { return Offset; } + + static SubtrieSlotValue getDataOffset(int64_t Offset) { + return SubtrieSlotValue(Offset); + } + + static SubtrieSlotValue getSubtrieOffset(int64_t Offset) { + return SubtrieSlotValue(-Offset); + } + + static SubtrieSlotValue getDataOffset(FileOffset Offset) { + return getDataOffset(Offset.get()); + } + + static SubtrieSlotValue getSubtrieOffset(FileOffset Offset) { + return getDataOffset(Offset.get()); + } + + static SubtrieSlotValue getFromSlot(std::atomic &Slot) { + return SubtrieSlotValue(Slot.load()); + } + + SubtrieSlotValue() = default; + +private: + friend class SubtrieHandle; + explicit SubtrieSlotValue(int64_t Offset) : Offset(Offset) {} + int64_t Offset = 0; +}; + +class 
HashMappedTrieHandle; + +/// Subtrie layout: +/// - 2-bytes: StartBit +/// - 1-bytes: NumBits=lg(num-slots) +/// - 1-bytes: NumUnusedBits=lg(num-slots-unused) +/// - 4-bytes: 0-pad +/// - 8-bytes each: 2^NumBits slots, stored directly after the header +class SubtrieHandle { +public: + struct Header { + /// The bit this subtrie starts on. + uint16_t StartBit; + + /// The number of bits this subtrie handles. It has 2^NumBits slots. + uint8_t NumBits; + + /// The number of extra bits this allocation *could* handle, due to + /// over-allocation. It has 2^NumUnusedBits unused slots. + uint8_t NumUnusedBits; + + /// 0-pad to 8B. + uint32_t ZeroPad4B; + }; + + /// Slot storage: + /// - zero: Empty + /// - positive: RecordOffset + /// - negative: SubtrieOffset + using SlotT = std::atomic; + + static int64_t getSlotsSize(uint32_t NumBits) { + return sizeof(int64_t) * (1u << NumBits); + } + + static int64_t getSize(uint32_t NumBits) { + return sizeof(SubtrieHandle::Header) + getSlotsSize(NumBits); + } + + int64_t getSize() const { return getSize(H->NumBits); } + + SubtrieSlotValue load(size_t I) const { + return SubtrieSlotValue(Slots[I].load()); + } + void store(size_t I, SubtrieSlotValue V) { + return Slots[I].store(V.getRawOffset()); + } + + void printHash(raw_ostream &OS, ArrayRef Bytes) const; + void print(raw_ostream &OS, HashMappedTrieHandle Trie, + SmallVectorImpl &Records, + std::optional Prefix = std::nullopt) const; + + /// Atomically replace slot \p I with \p New if it still holds \p Expected. + /// Returns true on success. On failure returns false and updates + /// \p Expected to the value currently stored in the slot. + bool compare_exchange_strong(size_t I, SubtrieSlotValue &Expected, + SubtrieSlotValue New) { + return Slots[I].compare_exchange_strong(Expected.Offset, New.Offset); + } + + /// Sink \p V from \p I in this subtrie down to \p NewI in a new subtrie with + /// \p NumSubtrieBits. + /// + /// \p UnusedSubtrie maintains a 1-item "free" list of unused subtries. If a + /// new subtrie is created that isn't used because of a lost race, then it
+ /// should be returned as an out parameter to be passed back in the future. + /// If it's already valid, it should be used instead of allocating a new one. + /// + /// Returns the subtrie that now lives at \p I. + SubtrieHandle sink(size_t I, SubtrieSlotValue V, + MappedFileRegionBumpPtr &Alloc, size_t NumSubtrieBits, + SubtrieHandle &UnusedSubtrie, size_t NewI); + + /// Only safe if the subtrie is empty. + void reinitialize(uint32_t StartBit, uint32_t NumBits); + + SubtrieSlotValue getOffset() const { + return SubtrieSlotValue::getSubtrieOffset( + reinterpret_cast(H) - Region->data()); + } + + FileOffset getFileOffset() const { return getOffset().asSubtrieFileOffset(); } + + explicit operator bool() const { return H; } + + Header &getHeader() const { return *H; } + uint32_t getStartBit() const { return H->StartBit; } + uint32_t getNumBits() const { return H->NumBits; } + uint32_t getNumUnusedBits() const { return H->NumUnusedBits; } + + static SubtrieHandle create(MappedFileRegionBumpPtr &Alloc, uint32_t StartBit, + uint32_t NumBits, uint32_t NumUnusedBits = 0); + + static SubtrieHandle getFromFileOffset(MappedFileRegion &Region, + FileOffset Offset) { + return SubtrieHandle(Region, SubtrieSlotValue::getSubtrieOffset(Offset)); + } + + SubtrieHandle() = default; + SubtrieHandle(MappedFileRegion &Region, Header &H) + : Region(&Region), H(&H), Slots(getSlots(H)) {} + SubtrieHandle(MappedFileRegion &Region, SubtrieSlotValue Offset) + : SubtrieHandle(Region, *reinterpret_cast
( + Region.data() + Offset.asSubtrie())) {} + +private: + MappedFileRegion *Region = nullptr; + Header *H = nullptr; + MutableArrayRef Slots; + + static MutableArrayRef getSlots(Header &H) { + return MutableArrayRef(reinterpret_cast(&H + 1), 1u << H.NumBits); + } +}; + +/// Handle for a HashMappedTrie table. +/// +/// HashMappedTrie table layout: +/// - [8-bytes: Generic table header] +/// - 1-byte: NumSubtrieBits +/// - 1-byte: Flags (not used yet) +/// - 2-bytes: NumHashBits +/// - 4-bytes: RecordDataSize (in bytes) +/// - 8-bytes: RootTrieOffset +/// - 8-bytes: AllocatorOffset (reserved for implementing free lists) +/// - '\0' +/// +/// Record layout: +/// - +/// - +class HashMappedTrieHandle { +public: + static constexpr TableHandle::TableKind Kind = + TableHandle::TableKind::HashMappedTrie; + + struct Header { + TableHandle::Header GenericHeader; + uint8_t NumSubtrieBits; + uint8_t Flags; // None used yet. + uint16_t NumHashBits; + uint32_t RecordDataSize; + std::atomic RootTrieOffset; + std::atomic AllocatorOffset; + }; + + operator TableHandle() const { + if (!H) + return TableHandle(); + return TableHandle(*Region, H->GenericHeader); + } + + struct RecordData { + OnDiskHashMappedTrie::ValueProxy Proxy; + SubtrieSlotValue Offset; + FileOffset getFileOffset() const { return Offset.asDataFileOffset(); } + }; + + enum Limits : size_t { + /// Seems like 65528 hash bits ought to be enough. + MaxNumHashBytes = UINT16_MAX >> 3, + MaxNumHashBits = MaxNumHashBytes << 3, + + /// 2^16 bits in a trie is 65536 slots. This restricts us to a 16-bit + /// index. This many slots is suspicously large anyway. + MaxNumRootBits = 16, + + /// 2^10 bits in a trie is 1024 slots. This many slots seems suspiciously + /// large for subtries. 
+ MaxNumSubtrieBits = 10, + }; + + static constexpr size_t getNumHashBytes(size_t NumHashBits) { + assert(NumHashBits % 8 == 0); + return NumHashBits / 8; + } + static constexpr size_t getRecordSize(size_t RecordDataSize, + size_t NumHashBits) { + return RecordDataSize + getNumHashBytes(NumHashBits); + } + + RecordData getRecord(SubtrieSlotValue Offset); + RecordData createRecord(MappedFileRegionBumpPtr &Alloc, + ArrayRef Hash); + + explicit operator bool() const { return H; } + const Header &getHeader() const { return *H; } + SubtrieHandle getRoot() const; + SubtrieHandle getOrCreateRoot(MappedFileRegionBumpPtr &Alloc); + MappedFileRegion &getRegion() const { return *Region; } + + size_t getFlags() const { return H->Flags; } + uint64_t getNumSubtrieBits() const { return H->NumSubtrieBits; } + uint64_t getNumHashBits() const { return H->NumHashBits; } + size_t getNumHashBytes() const { return getNumHashBytes(H->NumHashBits); } + size_t getRecordDataSize() const { return H->RecordDataSize; } + size_t getRecordSize() const { + return getRecordSize(H->RecordDataSize, H->NumHashBits); + } + + IndexGenerator getIndexGen(SubtrieHandle Root, ArrayRef Hash) { + assert(Root.getStartBit() == 0); + assert(getNumHashBytes() == Hash.size()); + assert(getNumHashBits() == Hash.size() * 8); + return IndexGenerator{Root.getNumBits(), getNumSubtrieBits(), Hash}; + } + + static HashMappedTrieHandle + create(MappedFileRegionBumpPtr &Alloc, StringRef Name, + std::optional NumRootBits, uint64_t NumSubtrieBits, + uint64_t NumHashBits, uint64_t RecordDataSize); + + void + print(raw_ostream &OS, + function_ref)> PrintRecordData = nullptr) const; + + HashMappedTrieHandle() = default; + HashMappedTrieHandle(MappedFileRegion &Region, Header &H) + : Region(&Region), H(&H) {} + HashMappedTrieHandle(MappedFileRegion &Region, intptr_t HeaderOffset) + : HashMappedTrieHandle( + Region, *reinterpret_cast
(Region.data() + HeaderOffset)) { + } + +private: + MappedFileRegion *Region = nullptr; + Header *H = nullptr; +}; + +} // end anonymous namespace + +struct OnDiskHashMappedTrie::ImplType { + DatabaseFile File; + HashMappedTrieHandle Trie; +}; + +SubtrieHandle SubtrieHandle::create(MappedFileRegionBumpPtr &Alloc, + uint32_t StartBit, uint32_t NumBits, + uint32_t NumUnusedBits) { + assert(StartBit <= HashMappedTrieHandle::MaxNumHashBits); + assert(NumBits <= UINT8_MAX); + assert(NumUnusedBits <= UINT8_MAX); + assert(NumBits + NumUnusedBits <= HashMappedTrieHandle::MaxNumRootBits); + + void *Mem = Alloc.allocate(getSize(NumBits + NumUnusedBits)); + auto *H = + new (Mem) SubtrieHandle::Header{(uint16_t)StartBit, (uint8_t)NumBits, + (uint8_t)NumUnusedBits, /*ZeroPad4B=*/0}; + SubtrieHandle S(Alloc.getRegion(), *H); + for (auto I = S.Slots.begin(), E = S.Slots.end(); I != E; ++I) + new (I) SlotT(0); + return S; +} + +SubtrieHandle HashMappedTrieHandle::getRoot() const { + if (int64_t Root = H->RootTrieOffset) + return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Root)); + return SubtrieHandle(); +} + +SubtrieHandle +HashMappedTrieHandle::getOrCreateRoot(MappedFileRegionBumpPtr &Alloc) { + assert(&Alloc.getRegion() == &getRegion()); + if (SubtrieHandle Root = getRoot()) + return Root; + + int64_t Race = 0; + SubtrieHandle LazyRoot = SubtrieHandle::create(Alloc, 0, H->NumSubtrieBits); + if (H->RootTrieOffset.compare_exchange_strong( + Race, LazyRoot.getOffset().asSubtrie())) + return LazyRoot; + + // There was a race. Return the other root. + // + // TODO: Avoid leaking the lazy root by storing it in an allocator. + return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Race)); +} + +HashMappedTrieHandle +HashMappedTrieHandle::create(MappedFileRegionBumpPtr &Alloc, StringRef Name, + std::optional NumRootBits, + uint64_t NumSubtrieBits, uint64_t NumHashBits, + uint64_t RecordDataSize) { + // Allocate. 
+ intptr_t Offset = Alloc.allocateOffset(sizeof(Header) + Name.size() + 1); + + // Construct the header and the name. Each assert guards the narrowing cast + // applied to the same value in the header initializer below. + assert(Name.size() <= UINT16_MAX && "Expected smaller table name"); + assert(NumSubtrieBits <= UINT8_MAX && "Expected valid subtrie bits"); + assert(NumHashBits <= UINT16_MAX && "Expected valid hash size"); + assert(RecordDataSize <= UINT32_MAX && "Expected smaller record data size"); + auto *H = new (Alloc.getRegion().data() + Offset) + Header{{TableHandle::TableKind::HashMappedTrie, (uint16_t)Name.size(), + (uint32_t)sizeof(Header)}, + (uint8_t)NumSubtrieBits, + /*Flags=*/0, + (uint16_t)NumHashBits, + (uint32_t)RecordDataSize, + /*RootTrieOffset=*/{0}, + /*AllocatorOffset=*/{0}}; + // The null-terminated table name is stored immediately after the header. + char *NameStorage = reinterpret_cast(H + 1); + llvm::copy(Name, NameStorage); + NameStorage[Name.size()] = 0; + + // Construct a root trie, if requested. + HashMappedTrieHandle Trie(Alloc.getRegion(), *H); + if (NumRootBits) + H->RootTrieOffset = + SubtrieHandle::create(Alloc, 0, *NumRootBits).getOffset().asSubtrie(); + return Trie; +} + +/// Build a proxy for the record at \p Offset: getRecordDataSize() bytes of +/// data immediately followed by getNumHashBytes() bytes of hash. +HashMappedTrieHandle::RecordData +HashMappedTrieHandle::getRecord(SubtrieSlotValue Offset) { + char *Begin = Region->data() + Offset.asData(); + OnDiskHashMappedTrie::ValueProxy Proxy; + Proxy.Data = MutableArrayRef(Begin, getRecordDataSize()); + Proxy.Hash = ArrayRef(reinterpret_cast(Proxy.Data.end()), + getNumHashBytes()); + return RecordData{Proxy, Offset}; +} + +/// Allocate a new record from \p Alloc and copy \p Hash into its hash field. +/// The data field is not written here. +HashMappedTrieHandle::RecordData +HashMappedTrieHandle::createRecord(MappedFileRegionBumpPtr &Alloc, + ArrayRef Hash) { + assert(&Alloc.getRegion() == Region); + assert(Hash.size() == getNumHashBytes()); + RecordData Record = getRecord( + SubtrieSlotValue::getDataOffset(Alloc.allocateOffset(getRecordSize()))); + llvm::copy(Hash, const_cast(Record.Proxy.Hash.begin())); + return Record; +} + +OnDiskHashMappedTrie::const_pointer +OnDiskHashMappedTrie::recoverFromHashPointer( + const uint8_t *HashBeginPtr) const { + // Record hashes occur immediately after data.
Compute the beginning of the + // record and check for overflow. Only the record data precedes the hash + // (see getRecord()), so step back by the data size rather than by the full + // record size, which also counts the hash bytes. + const uintptr_t HashBegin = reinterpret_cast(HashBeginPtr); + const uintptr_t RecordBegin = + HashBegin - Impl->Trie.getRecordDataSize(); + if (HashBegin < RecordBegin) + return const_pointer(); + + // Check that it'll be a positive offset. + const uintptr_t FileBegin = + reinterpret_cast(Impl->File.getRegion().data()); + if (RecordBegin < FileBegin) + return const_pointer(); + + // Good enough to form an offset. Continue checking there. + return recoverFromFileOffset(FileOffset(RecordBegin - FileBegin)); +} + +OnDiskHashMappedTrie::const_pointer +OnDiskHashMappedTrie::recoverFromFileOffset(FileOffset Offset) const { + // Check alignment. + if (!isAligned(MappedFileRegionBumpPtr::getAlign(), Offset.get())) + return const_pointer(); + + // Check bounds. + // + // Note: There's no potential overflow when using \c uint64_t because Offset + // is in \c [0,INT64_MAX] and the record size is in \c [0,UINT32_MAX]. + assert(Offset.get() >= 0 && "Expected FileOffset constructor guarantee this"); + if ((uint64_t)Offset.get() + Impl->Trie.getRecordSize() > + Impl->File.getAlloc().size()) + return const_pointer(); + + // Looks okay... + HashMappedTrieHandle::RecordData D = + Impl->Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset)); + return const_pointer(D.getFileOffset(), D.Proxy); +} + +OnDiskHashMappedTrie::const_pointer +OnDiskHashMappedTrie::find(ArrayRef Hash) const { + HashMappedTrieHandle Trie = Impl->Trie; + assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash"); + + SubtrieHandle S = Trie.getRoot(); + if (!S) + return const_pointer(); + + IndexGenerator IndexGen = Trie.getIndexGen(S, Hash); + size_t Index = IndexGen.next(); + for (;;) { + // Load the slot. An empty slot means the hash is absent; return a hint + // naming the subtrie and index where it was expected. + SubtrieSlotValue V = S.load(Index); + if (!V) + return const_pointer(S.getFileOffset(), + HintT(this, Index, *IndexGen.StartBit)); + + // Check for an exact match.
+ if (V.isData()) { + HashMappedTrieHandle::RecordData D = Trie.getRecord(V); + return D.Proxy.Hash == Hash + ? const_pointer(D.getFileOffset(), D.Proxy) + : const_pointer(S.getFileOffset(), + HintT(this, Index, *IndexGen.StartBit)); + } + + Index = IndexGen.next(); + S = SubtrieHandle(Trie.getRegion(), V); + } +} + +/// Only safe if the subtrie is empty. +void SubtrieHandle::reinitialize(uint32_t StartBit, uint32_t NumBits) { + assert(StartBit > H->StartBit); + assert(NumBits <= H->NumBits); + // Ideally would also assert that all slots are empty, but that's expensive. + + H->StartBit = StartBit; + H->NumBits = NumBits; +} + +OnDiskHashMappedTrie::pointer +OnDiskHashMappedTrie::insertLazy(const_pointer Hint, ArrayRef Hash, + LazyInsertOnConstructCB OnConstruct, + LazyInsertOnLeakCB OnLeak) { + HashMappedTrieHandle Trie = Impl->Trie; + assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash"); + + MappedFileRegionBumpPtr &Alloc = Impl->File.getAlloc(); + SubtrieHandle S = Trie.getOrCreateRoot(Alloc); + IndexGenerator IndexGen = Trie.getIndexGen(S, Hash); + + size_t Index; + if (std::optional H = Hint.getHint(*this)) { + S = SubtrieHandle::getFromFileOffset(Trie.getRegion(), Hint.getOffset()); + Index = IndexGen.hint(H->I, H->B); + } else { + Index = IndexGen.next(); + } + + // FIXME: Add non-assertion based checks for data corruption that would + // otherwise cause infinite loops in release builds, instead calling + // report_fatal_error(). + // + // Two loops are possible: + // - All bits used up in the IndexGenerator because subtries are somehow + // linked in a cycle. Could confirm that each subtrie's start-bit + // follows from the start-bit and num-bits of its parent. Could also check + // that the generator doesn't run out of bits. + // - Existing data matches tail of Hash but not the head (stored in an + // invalid spot). Probably a cheap way to check this too, but needs + // thought. 
+ std::optional NewRecord; + SubtrieHandle UnusedSubtrie; + for (;;) { + SubtrieSlotValue Existing = S.load(Index); + + // Try to set it, if it's empty. + if (!Existing) { + if (!NewRecord) { + NewRecord = Trie.createRecord(Alloc, Hash); + if (OnConstruct) + OnConstruct(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy); + } + + if (S.compare_exchange_strong(Index, Existing, NewRecord->Offset)) + return pointer(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy); + + // Race means that Existing is no longer empty; fall through... + } + + if (Existing.isSubtrie()) { + S = SubtrieHandle(Trie.getRegion(), Existing); + Index = IndexGen.next(); + continue; + } + + // Check for an exact match. + HashMappedTrieHandle::RecordData ExistingRecord = Trie.getRecord(Existing); + if (ExistingRecord.Proxy.Hash == Hash) { + if (NewRecord && OnLeak) + OnLeak(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy, + ExistingRecord.Offset.asDataFileOffset(), ExistingRecord.Proxy); + return pointer(ExistingRecord.Offset.asDataFileOffset(), + ExistingRecord.Proxy); + } + + // Sink the existing content as long as the indexes match. + for (;;) { + size_t NextIndex = IndexGen.next(); + size_t NewIndexForExistingContent = + IndexGen.getCollidingBits(ExistingRecord.Proxy.Hash); + + S = S.sink(Index, Existing, Alloc, IndexGen.getNumBits(), UnusedSubtrie, + NewIndexForExistingContent); + Index = NextIndex; + + // Found the difference. + if (NextIndex != NewIndexForExistingContent) + break; + } + } +} + +SubtrieHandle SubtrieHandle::sink(size_t I, SubtrieSlotValue V, + MappedFileRegionBumpPtr &Alloc, + size_t NumSubtrieBits, + SubtrieHandle &UnusedSubtrie, size_t NewI) { + SubtrieHandle NewS; + if (UnusedSubtrie) { + // Steal UnusedSubtrie and initialize it. + std::swap(NewS, UnusedSubtrie); + NewS.reinitialize(getStartBit() + getNumBits(), NumSubtrieBits); + } else { + // Allocate a new, empty subtrie. 
+ NewS = SubtrieHandle::create(Alloc, getStartBit() + getNumBits(), + NumSubtrieBits); + } + + NewS.store(NewI, V); + if (compare_exchange_strong(I, V, NewS.getOffset())) + return NewS; // Success! + + // Raced. + assert(V.isSubtrie() && "Expected racing sink() to add a subtrie"); + + // Wipe out the new slot so NewS can be reused and set the out parameter. + NewS.store(NewI, SubtrieSlotValue()); + UnusedSubtrie = NewS; + + // Return the subtrie added by the concurrent sink() call. + return SubtrieHandle(Alloc.getRegion(), V); +} + +void OnDiskHashMappedTrie::print( + raw_ostream &OS, function_ref)> PrintRecordData) const { + Impl->Trie.print(OS, PrintRecordData); +} + +static void printHexDigit(raw_ostream &OS, uint8_t Digit) { + if (Digit < 10) + OS << char(Digit + '0'); + else + OS << char(Digit - 10 + 'a'); +} + +static void printHexDigits(raw_ostream &OS, ArrayRef Bytes, + size_t StartBit, size_t NumBits) { + assert(StartBit % 4 == 0); + assert(NumBits % 4 == 0); + for (size_t I = StartBit, E = StartBit + NumBits; I != E; I += 4) { + uint8_t HexPair = Bytes[I / 8]; + uint8_t HexDigit = I % 8 == 0 ? 
HexPair >> 4 : HexPair & 0xf; + printHexDigit(OS, HexDigit); + } +} + +void HashMappedTrieHandle::print( + raw_ostream &OS, function_ref)> PrintRecordData) const { + OS << "hash-num-bits=" << getNumHashBits() + << " hash-size=" << getNumHashBytes() + << " record-data-size=" << getRecordDataSize() << "\n"; + SubtrieHandle Root = getRoot(); + + SmallVector Records; + if (Root) + Root.print(OS, *this, Records); + + if (Records.empty()) + return; + llvm::sort(Records); + OS << "records\n"; + for (int64_t Offset : Records) { + OS << "- addr=" << (void *)Offset << " "; + HashMappedTrieHandle Trie = *this; + HashMappedTrieHandle::RecordData Record = + Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset)); + if (PrintRecordData) { + PrintRecordData(Record.Proxy.Data); + } else { + OS << "bytes="; + ArrayRef Data( + reinterpret_cast(Record.Proxy.Data.data()), + Record.Proxy.Data.size()); + printHexDigits(OS, Data, 0, Data.size() * 8); + } + OS << "\n"; + } +} + +static void printBits(raw_ostream &OS, ArrayRef Bytes, size_t StartBit, + size_t NumBits) { + assert(StartBit + NumBits <= Bytes.size() * 8u); + for (size_t I = StartBit, E = StartBit + NumBits; I != E; ++I) { + uint8_t Byte = Bytes[I / 8]; + size_t ByteOffset = I % 8; + if (size_t ByteShift = 8 - ByteOffset - 1) + Byte >>= ByteShift; + OS << (Byte & 0x1 ? 
'1' : '0'); + } +} + +void SubtrieHandle::printHash(raw_ostream &OS, ArrayRef Bytes) const { + // afb[1c:00*01110*0]def + size_t EndBit = getStartBit() + getNumBits(); + size_t HashEndBit = Bytes.size() * 8u; + + size_t FirstBinaryBit = getStartBit() & ~0x3u; + printHexDigits(OS, Bytes, 0, FirstBinaryBit); + + size_t LastBinaryBit = (EndBit + 3u) & ~0x3u; + OS << "["; + printBits(OS, Bytes, FirstBinaryBit, LastBinaryBit - FirstBinaryBit); + OS << "]"; + + printHexDigits(OS, Bytes, LastBinaryBit, HashEndBit - LastBinaryBit); +} + +static void appendIndexBits(std::string &Prefix, size_t Index, + size_t NumSlots) { + std::string Bits; + for (size_t NumBits = 1u; NumBits < NumSlots; NumBits <<= 1) { + Bits.push_back('0' + (Index & 0x1)); + Index >>= 1; + } + for (char Ch : llvm::reverse(Bits)) + Prefix += Ch; +} + +static void printPrefix(raw_ostream &OS, StringRef Prefix) { + while (Prefix.size() >= 4) { + uint8_t Digit; + bool ErrorParsingBinary = Prefix.take_front(4).getAsInteger(2, Digit); + assert(!ErrorParsingBinary); + (void)ErrorParsingBinary; + printHexDigit(OS, Digit); + Prefix = Prefix.drop_front(4); + } + if (!Prefix.empty()) + OS << "[" << Prefix << "]"; +} + +void SubtrieHandle::print(raw_ostream &OS, HashMappedTrieHandle Trie, + SmallVectorImpl &Records, + std::optional Prefix) const { + if (!Prefix) { + OS << "root"; + Prefix.emplace(); + } else { + OS << "subtrie="; + printPrefix(OS, *Prefix); + } + + OS << " addr=" + << (void *)(reinterpret_cast(H) - Region->data()); + + const size_t NumSlots = Slots.size(); + OS << " num-slots=" << NumSlots << "\n"; + SmallVector Subs; + SmallVector Prefixes; + for (size_t I = 0, E = NumSlots; I != E; ++I) { + SubtrieSlotValue Slot = load(I); + if (!Slot) + continue; + OS << "- index="; + for (size_t Pad : {10, 100, 1000}) + if (I < Pad && NumSlots >= Pad) + OS << "0"; + OS << I << " "; + if (Slot.isSubtrie()) { + SubtrieHandle S(*Region, Slot); + std::string SubtriePrefix = *Prefix; + appendIndexBits(SubtriePrefix, 
I, NumSlots); + OS << "addr=" << (void *)Slot.asSubtrie(); + OS << " subtrie="; + printPrefix(OS, SubtriePrefix); + OS << "\n"; + Subs.push_back(S); + Prefixes.push_back(SubtriePrefix); + continue; + } + Records.push_back(Slot.asData()); + HashMappedTrieHandle::RecordData Record = Trie.getRecord(Slot); + OS << "addr=" << (void *)Record.getFileOffset().get(); + OS << " content="; + printHash(OS, Record.Proxy.Hash); + OS << "\n"; + } + for (size_t I = 0, E = Subs.size(); I != E; ++I) + Subs[I].print(OS, Trie, Records, Prefixes[I]); +} + +LLVM_DUMP_METHOD void OnDiskHashMappedTrie::dump() const { print(dbgs()); } + +static Error createTableConfigError(std::errc ErrC, StringRef Path, + StringRef TableName, const Twine &Msg) { + return createStringError(make_error_code(ErrC), + Path + "[" + TableName + "]: " + Msg); +} + +static Expected checkParameter(StringRef Label, size_t Max, + std::optional Value, + std::optional Default, + StringRef Path, StringRef TableName) { + assert(Value || Default); + assert(!Default || *Default <= Max); + if (!Value) + return *Default; + + if (*Value <= Max) + return *Value; + return createTableConfigError( + std::errc::argument_out_of_domain, Path, TableName, + "invalid " + Label + ": " + Twine(*Value) + " (max: " + Twine(Max) + ")"); +} + +static Error checkTable(StringRef Label, size_t Expected, size_t Observed, + StringRef Path, StringRef TrieName) { + if (Expected == Observed) + return Error::success(); + return createTableConfigError(std::errc::invalid_argument, Path, TrieName, + "mismatched " + Label + + " (expected: " + Twine(Expected) + + ", observed: " + Twine(Observed) + ")"); +} + +size_t OnDiskHashMappedTrie::size() const { return Impl->File.size(); } + +Expected +OnDiskHashMappedTrie::create(const Twine &PathTwine, const Twine &TrieNameTwine, + size_t NumHashBits, uint64_t DataSize, + uint64_t MaxFileSize, + std::optional NewFileInitialSize, + std::optional NewTableNumRootBits, + std::optional NewTableNumSubtrieBits) { + 
SmallString<128> PathStorage; + StringRef Path = PathTwine.toStringRef(PathStorage); + SmallString<128> TrieNameStorage; + StringRef TrieName = TrieNameTwine.toStringRef(TrieNameStorage); + + constexpr size_t DefaultNumRootBits = 10; + constexpr size_t DefaultNumSubtrieBits = 6; + + size_t NumRootBits; + if (Error E = checkParameter( + "root bits", HashMappedTrieHandle::MaxNumRootBits, + NewTableNumRootBits, DefaultNumRootBits, Path, TrieName) + .moveInto(NumRootBits)) + return std::move(E); + + size_t NumSubtrieBits; + if (Error E = checkParameter("subtrie bits", + HashMappedTrieHandle::MaxNumSubtrieBits, + NewTableNumSubtrieBits, DefaultNumSubtrieBits, + Path, TrieName) + .moveInto(NumSubtrieBits)) + return std::move(E); + + size_t NumHashBytes = NumHashBits >> 3; + if (Error E = + checkParameter("hash size", HashMappedTrieHandle::MaxNumHashBits, + NumHashBits, std::nullopt, Path, TrieName) + .takeError()) + return std::move(E); + assert(NumHashBits == NumHashBytes << 3 && + "Expected hash size to be byte-aligned"); + if (NumHashBits != NumHashBytes << 3) + return createTableConfigError( + std::errc::argument_out_of_domain, Path, TrieName, + "invalid hash size: " + Twine(NumHashBits) + " (not byte-aligned)"); + + // Constructor for if the file doesn't exist. + auto NewDBConstructor = [&](DatabaseFile &DB) -> Error { + HashMappedTrieHandle Trie = + HashMappedTrieHandle::create(DB.getAlloc(), TrieName, NumRootBits, + NumSubtrieBits, NumHashBits, DataSize); + DB.addTable(Trie); + return Error::success(); + }; + + // Get or create the file. + Expected File = + DatabaseFile::create(Path, MaxFileSize, NewDBConstructor); + if (!File) + return File.takeError(); + + // Find the trie and validate it. + // + // TODO: Add support for creating/adding a table to an existing file. 
+ std::optional Table = File->findTable(TrieName); + if (!Table) + return createTableConfigError(std::errc::argument_out_of_domain, Path, + TrieName, "table not found"); + if (Error E = checkTable("table kind", (size_t)HashMappedTrieHandle::Kind, + (size_t)Table->getHeader().Kind, Path, TrieName)) + return std::move(E); + auto Trie = Table->cast(); + assert(Trie && "Already checked the kind"); + + // Check the hash and data size. + if (Error E = checkTable("hash size", NumHashBits, Trie.getNumHashBits(), + Path, TrieName)) + return std::move(E); + if (Error E = checkTable("data size", DataSize, Trie.getRecordDataSize(), + Path, TrieName)) + return std::move(E); + + // No flags supported right now. Either corrupt, or coming from a future + // writer. + if (size_t Flags = Trie.getFlags()) + return createTableConfigError(std::errc::invalid_argument, Path, TrieName, + "unsupported flags: " + Twine(Flags)); + + // Success. + OnDiskHashMappedTrie::ImplType Impl{DatabaseFile(std::move(*File)), Trie}; + return OnDiskHashMappedTrie(std::make_unique(std::move(Impl))); +} + +//===----------------------------------------------------------------------===// +// DataAllocator data structures. 
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// DataAllocator table layout:
+/// - [8-bytes: Generic table header]
+/// - 8-bytes: AllocatorOffset (reserved for implementing free lists)
+/// - 8-bytes: Size for user data header
+/// -
+///
+/// Record layout:
+/// -
+class DataAllocatorHandle {
+public:
+  static constexpr TableHandle::TableKind Kind =
+      TableHandle::TableKind::DataAllocator;
+
+  struct Header {
+    TableHandle::Header GenericHeader;
+    std::atomic AllocatorOffset;
+    const uint64_t UserHeaderSize;
+  };
+
+  /// Convert to the generic table handle; a null handle converts to a
+  /// default-constructed \c TableHandle.
+  operator TableHandle() const {
+    if (!H)
+      return TableHandle();
+    return TableHandle(*Region, H->GenericHeader);
+  }
+
+  /// Allocate \p DataSize bytes from \p Alloc, which must manage the same
+  /// region as this handle.
+  MutableArrayRef allocate(MappedFileRegionBumpPtr &Alloc,
+                           size_t DataSize) {
+    assert(&Alloc.getRegion() == Region);
+    return MutableArrayRef(Alloc.allocate(DataSize), DataSize);
+  }
+
+  explicit operator bool() const { return H; }
+  const Header &getHeader() const { return *H; }
+  MappedFileRegion &getRegion() const { return *Region; }
+
+  /// The user header is stored directly after the table header.
+  MutableArrayRef getUserHeader() {
+    return MutableArrayRef(reinterpret_cast(H + 1),
+                           H->UserHeaderSize);
+  }
+
+  static DataAllocatorHandle create(MappedFileRegionBumpPtr &Alloc,
+                                    StringRef Name, uint32_t UserHeaderSize);
+
+  DataAllocatorHandle() = default;
+  DataAllocatorHandle(MappedFileRegion &Region, Header &H)
+      : Region(&Region), H(&H) {}
+  DataAllocatorHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+      : DataAllocatorHandle(
+            Region, *reinterpret_cast(Region.data() + HeaderOffset)) {
+  }
+
+private:
+  MappedFileRegion *Region = nullptr;
+  Header *H = nullptr;
+};
+
+} // end anonymous namespace
+
+struct OnDiskDataAllocator::ImplType {
+  DatabaseFile File;
+  DataAllocatorHandle Store;
+};
+
+/// Allocate and initialize a DataAllocator table in \p Alloc. The storage is
+/// laid out as: header, \p UserHeaderSize zeroed bytes, NUL-terminated name.
+DataAllocatorHandle DataAllocatorHandle::create(MappedFileRegionBumpPtr &Alloc,
+                                                StringRef Name,
+                                                uint32_t UserHeaderSize) {
+  // Allocate.
+  intptr_t Offset =
+      Alloc.allocateOffset(sizeof(Header) + UserHeaderSize + Name.size() + 1);
+
+  // Construct the header and the name.
+  assert(Name.size() <= UINT16_MAX && "Expected smaller table name");
+  auto *H = new (Alloc.getRegion().data() + Offset)
+      Header{{TableHandle::TableKind::DataAllocator, (uint16_t)Name.size(),
+              (int32_t)(sizeof(Header) + UserHeaderSize)},
+             /*AllocatorOffset=*/{0},
+             /*UserHeaderSize=*/UserHeaderSize};
+  memset(H + 1, 0, UserHeaderSize);
+  char *NameStorage = reinterpret_cast(H + 1) + UserHeaderSize;
+  llvm::copy(Name, NameStorage);
+  NameStorage[Name.size()] = 0;
+  return DataAllocatorHandle(Alloc.getRegion(), *H);
+}
+
+/// Open the allocator table named \p TableNameTwine in the file at
+/// \p PathTwine, creating the file and table when the file does not exist.
+/// \p UserHeaderInit is only invoked for a newly created table with a
+/// non-zero \p UserHeaderSize.
+Expected OnDiskDataAllocator::create(
+    const Twine &PathTwine, const Twine &TableNameTwine, uint64_t MaxFileSize,
+    std::optional NewFileInitialSize, uint32_t UserHeaderSize,
+    function_ref UserHeaderInit) {
+  assert(!UserHeaderSize || UserHeaderInit);
+  SmallString<128> PathStorage;
+  StringRef Path = PathTwine.toStringRef(PathStorage);
+  SmallString<128> TableNameStorage;
+  StringRef TableName = TableNameTwine.toStringRef(TableNameStorage);
+
+  // Constructor for if the file doesn't exist.
+  auto NewDBConstructor = [&](DatabaseFile &DB) -> Error {
+    DataAllocatorHandle Store =
+        DataAllocatorHandle::create(DB.getAlloc(), TableName, UserHeaderSize);
+    DB.addTable(Store);
+    if (UserHeaderSize)
+      UserHeaderInit(Store.getUserHeader().data());
+    return Error::success();
+  };
+
+  // Get or create the file.
+  Expected File =
+      DatabaseFile::create(Path, MaxFileSize, NewDBConstructor);
+  if (!File)
+    return File.takeError();
+
+  // Find the table and validate it.
+  //
+  // TODO: Add support for creating/adding a table to an existing file.
+  std::optional Table = File->findTable(TableName);
+  if (!Table)
+    return createTableConfigError(std::errc::argument_out_of_domain, Path,
+                                  TableName, "table not found");
+  if (Error E = checkTable("table kind", (size_t)DataAllocatorHandle::Kind,
+                           (size_t)Table->getHeader().Kind, Path, TableName))
+    return std::move(E);
+  auto Store = Table->cast();
+  assert(Store && "Already checked the kind");
+
+  // Success.
+  OnDiskDataAllocator::ImplType Impl{DatabaseFile(std::move(*File)), Store};
+  return OnDiskDataAllocator(std::make_unique(std::move(Impl)));
+}
+
+/// Allocate \p Size bytes and return them together with their file offset.
+OnDiskDataAllocator::pointer OnDiskDataAllocator::allocate(size_t Size) {
+  MutableArrayRef Data =
+      Impl->Store.allocate(Impl->File.getAlloc(), Size);
+  return pointer(FileOffset(Data.data() - Impl->Store.getRegion().data()),
+                 Data);
+}
+
+/// Translate \p Offset into a pointer inside the mapped region.
+const char *OnDiskDataAllocator::beginData(FileOffset Offset) const {
+  assert(Offset);
+  assert(Impl);
+  assert(Offset.get() < (int64_t)Impl->File.getAlloc().size());
+  return Impl->File.getRegion().data() + Offset.get();
+}
+
+MutableArrayRef OnDiskDataAllocator::getUserHeader() {
+  return Impl->Store.getUserHeader();
+}
+
+size_t OnDiskDataAllocator::size() const { return Impl->File.size(); }
+
+OnDiskDataAllocator::OnDiskDataAllocator(std::unique_ptr Impl)
+    : Impl(std::move(Impl)) {}
+
+#else // !LLVM_ENABLE_ONDISK_CAS
+
+// Stubs for builds without on-disk CAS support: every entry point aborts
+// with report_fatal_error("not supported").
+
+struct OnDiskHashMappedTrie::ImplType {};
+
+Expected
+OnDiskHashMappedTrie::create(const Twine &PathTwine, const Twine &TrieNameTwine,
+                             size_t NumHashBits, uint64_t DataSize,
+                             uint64_t MaxFileSize,
+                             std::optional NewFileInitialSize,
+                             std::optional NewTableNumRootBits,
+                             std::optional NewTableNumSubtrieBits) {
+  report_fatal_error("not supported");
+}
+
+OnDiskHashMappedTrie::pointer
+OnDiskHashMappedTrie::insertLazy(const_pointer Hint, ArrayRef Hash,
+                                 LazyInsertOnConstructCB OnConstruct,
+                                 LazyInsertOnLeakCB OnLeak) {
+  report_fatal_error("not supported");
+}
+
+OnDiskHashMappedTrie::const_pointer
+OnDiskHashMappedTrie::recoverFromFileOffset(FileOffset Offset) const {
+  report_fatal_error("not supported");
+}
+
+// recoverFromHashPointer() is defined in the enabled configuration, so it
+// needs a definition here too; otherwise any caller fails to link when
+// on-disk CAS is disabled.
+OnDiskHashMappedTrie::const_pointer
+OnDiskHashMappedTrie::recoverFromHashPointer(
+    const uint8_t *HashBeginPtr) const {
+  report_fatal_error("not supported");
+}
+
+OnDiskHashMappedTrie::const_pointer
+OnDiskHashMappedTrie::find(ArrayRef Hash) const {
+  report_fatal_error("not supported");
+}
+
+void OnDiskHashMappedTrie::print(
+    raw_ostream &OS, function_ref)> PrintRecordData) const {
+  report_fatal_error("not supported");
+}
+
+size_t OnDiskHashMappedTrie::size() const {
+  report_fatal_error("not supported");
+}
+
+struct OnDiskDataAllocator::ImplType {};
+
+Expected OnDiskDataAllocator::create(
+    const Twine &Path, const Twine &TableName, uint64_t MaxFileSize,
+    std::optional NewFileInitialSize, uint32_t UserHeaderSize,
+    function_ref UserHeaderInit) {
+  report_fatal_error("not supported");
+}
+
+OnDiskDataAllocator::pointer OnDiskDataAllocator::allocate(size_t Size) {
+  report_fatal_error("not supported");
+}
+
+const char *OnDiskDataAllocator::beginData(FileOffset Offset) const {
+  report_fatal_error("not supported");
+}
+
+MutableArrayRef OnDiskDataAllocator::getUserHeader() {
+  report_fatal_error("not supported");
+}
+
+size_t OnDiskDataAllocator::size() const {
+  report_fatal_error("not supported");
+}
+
+#endif // LLVM_ENABLE_ONDISK_CAS
+
+OnDiskHashMappedTrie::OnDiskHashMappedTrie(std::unique_ptr Impl)
+    : Impl(std::move(Impl)) {}
+OnDiskHashMappedTrie::OnDiskHashMappedTrie(OnDiskHashMappedTrie &&RHS) =
+    default;
+OnDiskHashMappedTrie &
+OnDiskHashMappedTrie::operator=(OnDiskHashMappedTrie &&RHS) = default;
+OnDiskHashMappedTrie::~OnDiskHashMappedTrie() = default;
+
+OnDiskDataAllocator::OnDiskDataAllocator(OnDiskDataAllocator &&RHS) = default;
+OnDiskDataAllocator &
+OnDiskDataAllocator::operator=(OnDiskDataAllocator &&RHS) = default;
+OnDiskDataAllocator::~OnDiskDataAllocator() = default;
diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp
new file mode 100644
index 00000000000000..b5d24a93e218be
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp
@@ -0,0 +1,78 @@
+//===- OnDiskKeyValueDB.cpp -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Path.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+static constexpr StringLiteral ActionCacheFile = "actions";
+static constexpr StringLiteral FilePrefix = "v3.";
+
+/// Store \p Value under \p Key unless the key is already present, and return
+/// the bytes that are stored for the key. Fails if \p Value does not have
+/// exactly \c ValueSize bytes.
+Expected> OnDiskKeyValueDB::put(ArrayRef Key,
+                                ArrayRef Value) {
+  // Both sizes are unsigned, so format them with utostr().
+  if (LLVM_UNLIKELY(Value.size() != ValueSize))
+    return createStringError(errc::invalid_argument,
+                             "expected value size of " + utostr(ValueSize) +
+                                 ", got: " + utostr(Value.size()));
+  assert(Value.size() == ValueSize);
+  // The callback only runs when a new record is constructed for Key.
+  OnDiskHashMappedTrie::pointer ActionP = Cache.insertLazy(
+      Key, [&](FileOffset TentativeOffset,
+               OnDiskHashMappedTrie::ValueProxy TentativeValue) {
+        assert(TentativeValue.Data.size() == ValueSize);
+        llvm::copy(Value, TentativeValue.Data.data());
+      });
+  return ActionP->Data;
+}
+
+/// Look up \p Key and return its stored bytes, or \c std::nullopt when the
+/// key is not present.
+Expected>>
+OnDiskKeyValueDB::get(ArrayRef Key) {
+  // Check the result cache.
+  OnDiskHashMappedTrie::const_pointer ActionP = Cache.find(Key);
+  if (!ActionP)
+    return std::nullopt;
+  assert(isAddrAligned(Align(8), ActionP->Data.data()));
+  return ActionP->Data;
+}
+
+/// Open (creating if needed) the key-value database in directory \p Path.
+/// The mapping size defaults to 1 GB unless getOverriddenMaxMappingSize()
+/// supplies a value.
+Expected>
+OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize,
+                       StringRef ValueName, size_t ValueSize) {
+  if (std::error_code EC = sys::fs::create_directories(Path))
+    return createFileError(Path, EC);
+
+  SmallString<256> CachePath(Path);
+  sys::path::append(CachePath, FilePrefix + ActionCacheFile);
+  constexpr uint64_t MB = 1024ull * 1024ull;
+  constexpr uint64_t GB = 1024ull * 1024ull * 1024ull;
+
+  uint64_t MaxFileSize = GB;
+  auto CustomSize = getOverriddenMaxMappingSize();
+  if (!CustomSize)
+    return CustomSize.takeError();
+  if (*CustomSize)
+    MaxFileSize = **CustomSize;
+
+  std::optional ActionCache;
+  if (Error E = OnDiskHashMappedTrie::create(
+                    CachePath,
+                    "llvm.actioncache[" + HashName + "->" + ValueName + "]",
+                    KeySize * 8,
+                    /*DataSize=*/ValueSize, MaxFileSize, /*MinFileSize=*/MB)
+                    .moveInto(ActionCache))
+    return std::move(E);
+
+  return std::unique_ptr(
+      new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache)));
+}
diff --git a/llvm/lib/CAS/PluginAPI.h b/llvm/lib/CAS/PluginAPI.h
new file mode 100644
index 00000000000000..b9505677e6c11c
--- /dev/null
+++ b/llvm/lib/CAS/PluginAPI.h
@@ -0,0 +1,97 @@
+//===- PluginAPI.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CAS_PLUGINAPI_H
+#define LLVM_LIB_CAS_PLUGINAPI_H
+
+#include "llvm-c/CAS/PluginAPI_types.h"
+
+/// See documentation in \c "llvm-c/CAS/PluginAPI_functions.h" for how these
+/// functions are used.
+struct llcas_functions_t {
+  // Table of entry points resolved from a loaded CAS plugin. Each member is
+  // filled by looking up the symbol "llcas_<member name>", driven by the list
+  // in PluginAPI_functions.def, so every member needs a matching entry there.
+
+  void (*get_plugin_version)(unsigned *major, unsigned *minor);
+
+  // Releases strings handed out by the plugin (error messages, printed
+  // digests, the hash schema name).
+  void (*string_dispose)(char *);
+
+  llcas_cas_options_t (*cas_options_create)(void);
+
+  void (*cas_options_dispose)(llcas_cas_options_t);
+
+  void (*cas_options_set_client_version)(llcas_cas_options_t, unsigned major,
+                                         unsigned minor);
+
+  void (*cas_options_set_ondisk_path)(llcas_cas_options_t, const char *path);
+
+  // NOTE(review): callers in PluginCAS.cpp treat a true result from the
+  // bool-returning entry points as failure, with *error set.
+  bool (*cas_options_set_option)(llcas_cas_options_t, const char *name,
+                                 const char *value, char **error);
+
+  llcas_cas_t (*cas_create)(llcas_cas_options_t, char **error);
+
+  void (*cas_dispose)(llcas_cas_t);
+
+  // NOTE(review): PluginCAS.cpp treats a result of 0 as an error, and a
+  // result larger than bytes_size as the size to retry with.
+  unsigned (*digest_parse)(llcas_cas_t, const char *printed_digest,
+                           uint8_t *bytes, size_t bytes_size, char **error);
+
+  bool (*digest_print)(llcas_cas_t, llcas_digest_t, char **printed_id,
+                       char **error);
+
+  char *(*cas_get_hash_schema_name)(llcas_cas_t);
+
+  bool (*cas_get_objectid)(llcas_cas_t, llcas_digest_t, llcas_objectid_t *,
+                           char **error);
+
+  llcas_digest_t (*objectid_get_digest)(llcas_cas_t, llcas_objectid_t);
+
+  llcas_lookup_result_t (*cas_contains_object)(llcas_cas_t, llcas_objectid_t,
+                                               bool globally, char **error);
+
+  llcas_lookup_result_t (*cas_load_object)(llcas_cas_t, llcas_objectid_t,
+                                           llcas_loaded_object_t *,
+                                           char **error);
+  void (*cas_load_object_async)(llcas_cas_t, llcas_objectid_t, void *ctx_cb,
+                                llcas_cas_load_object_cb);
+
+  bool (*cas_store_object)(llcas_cas_t, llcas_data_t,
+                           const llcas_objectid_t *refs, size_t refs_count,
+                           llcas_objectid_t *, char **error);
+
+  llcas_data_t (*loaded_object_get_data)(llcas_cas_t, llcas_loaded_object_t);
+
+  llcas_object_refs_t (*loaded_object_get_refs)(llcas_cas_t,
+                                                llcas_loaded_object_t);
+
+  size_t (*object_refs_get_count)(llcas_cas_t, llcas_object_refs_t);
+
+  llcas_objectid_t (*object_refs_get_id)(llcas_cas_t, llcas_object_refs_t,
+                                         size_t index);
+
+  /*===--------------------------------------------------------------------===*\
+  |* Action cache API
+  \*===--------------------------------------------------------------------===*/
+
+  llcas_lookup_result_t (*actioncache_get_for_digest)(llcas_cas_t,
+                                                      llcas_digest_t key,
+                                                      llcas_objectid_t *p_value,
+                                                      bool globally,
+                                                      char **error);
+
+  void (*actioncache_get_for_digest_async)(llcas_cas_t, llcas_digest_t key,
+                                           bool globally, void *ctx_cb,
+                                           llcas_actioncache_get_cb);
+
+  bool (*actioncache_put_for_digest)(llcas_cas_t, llcas_digest_t key,
+                                     llcas_objectid_t value, bool globally,
+                                     char **error);
+
+  void (*actioncache_put_for_digest_async)(llcas_cas_t, llcas_digest_t key,
+                                           llcas_objectid_t value,
+                                           bool globally, void *ctx_cb,
+                                           llcas_actioncache_put_cb);
+};
+
+#endif // LLVM_LIB_CAS_PLUGINAPI_H
diff --git a/llvm/lib/CAS/PluginAPI_functions.def b/llvm/lib/CAS/PluginAPI_functions.def
new file mode 100644
index 00000000000000..17e60e510b2fcf
--- /dev/null
+++ b/llvm/lib/CAS/PluginAPI_functions.def
@@ -0,0 +1,31 @@
+
+// Format is Name/required. If 'required' is true then loading will fail if the
+// symbol is missing, otherwise loading will continue and the function pointer
+// will be null. Entries are ordered lexicographically by name.
+
+// Every entry below is currently marked required, so a plugin that lacks any
+// one of these symbols is rejected at load time.
+CASPLUGINAPI_FUNCTION(actioncache_get_for_digest, true)
+CASPLUGINAPI_FUNCTION(actioncache_get_for_digest_async, true)
+CASPLUGINAPI_FUNCTION(actioncache_put_for_digest, true)
+CASPLUGINAPI_FUNCTION(actioncache_put_for_digest_async, true)
+CASPLUGINAPI_FUNCTION(cas_contains_object, true)
+CASPLUGINAPI_FUNCTION(cas_create, true)
+CASPLUGINAPI_FUNCTION(cas_dispose, true)
+CASPLUGINAPI_FUNCTION(cas_get_hash_schema_name, true)
+CASPLUGINAPI_FUNCTION(cas_get_objectid, true)
+CASPLUGINAPI_FUNCTION(cas_load_object, true)
+CASPLUGINAPI_FUNCTION(cas_load_object_async, true)
+CASPLUGINAPI_FUNCTION(cas_options_create, true)
+CASPLUGINAPI_FUNCTION(cas_options_dispose, true)
+CASPLUGINAPI_FUNCTION(cas_options_set_client_version, true)
+CASPLUGINAPI_FUNCTION(cas_options_set_ondisk_path, true)
+CASPLUGINAPI_FUNCTION(cas_options_set_option, true)
+CASPLUGINAPI_FUNCTION(cas_store_object, true)
+CASPLUGINAPI_FUNCTION(digest_parse, true)
+CASPLUGINAPI_FUNCTION(digest_print, true)
+CASPLUGINAPI_FUNCTION(get_plugin_version, true)
+CASPLUGINAPI_FUNCTION(loaded_object_get_data, true)
+CASPLUGINAPI_FUNCTION(loaded_object_get_refs, true)
+CASPLUGINAPI_FUNCTION(object_refs_get_count, true)
+CASPLUGINAPI_FUNCTION(object_refs_get_id, true)
+CASPLUGINAPI_FUNCTION(objectid_get_digest, true)
+CASPLUGINAPI_FUNCTION(string_dispose, true)
diff --git a/llvm/lib/CAS/PluginCAS.cpp b/llvm/lib/CAS/PluginCAS.cpp
new file mode 100644
index 00000000000000..333180c61996ee
--- /dev/null
+++ b/llvm/lib/CAS/PluginCAS.cpp
@@ -0,0 +1,523 @@
+//===- PluginCAS.cpp --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/PluginCAS.h"
+#include "PluginAPI.h"
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+namespace {
+
+/// CAS context backed by a dynamically loaded plugin. Owns the plugin's CAS
+/// instance and the table of resolved plugin entry points.
+class PluginCASContext : public CASContext {
+public:
+  void printIDImpl(raw_ostream &OS, const CASID &ID) const final;
+
+  StringRef getHashSchemaIdentifier() const final { return SchemaName; }
+
+  static Expected>
+  create(StringRef PluginPath, StringRef OnDiskPath,
+         ArrayRef> PluginArgs);
+
+  // Only dispose a CAS instance that was actually created; a context that
+  // never received one has no valid function table to call through.
+  ~PluginCASContext() {
+    if (c_cas)
+      Functions.cas_dispose(c_cas);
+  }
+
+  llcas_functions_t Functions{};
+  llcas_cas_t c_cas = nullptr;
+  std::string SchemaName;
+
+  /// Wrap the plugin-owned message \p c_err in an \c Error and release the
+  /// string through the plugin.
+  static Error errorAndDispose(char *c_err, const llcas_functions_t &Funcs) {
+    Error E = createStringError(inconvertibleErrorCode(), c_err);
+    Funcs.string_dispose(c_err);
+    return E;
+  }
+
+  Error errorAndDispose(char *c_err) const {
+    return errorAndDispose(c_err, Functions);
+  }
+};
+
+} // anonymous namespace
+
+/// Print \p ID using the plugin's digest printer; a plugin failure is fatal.
+void PluginCASContext::printIDImpl(raw_ostream &OS, const CASID &ID) const {
+  ArrayRef Hash = ID.getHash();
+  char *c_printed_id = nullptr;
+  char *c_err = nullptr;
+  if (Functions.digest_print(c_cas, llcas_digest_t{Hash.data(), Hash.size()},
+                             &c_printed_id, &c_err))
+    report_fatal_error(errorAndDispose(c_err));
+  OS << c_printed_id;
+  Functions.string_dispose(c_printed_id);
+}
+
+/// Load the plugin at \p PluginPath, resolve its entry points, and create a
+/// CAS instance rooted at \p OnDiskPath configured with \p PluginArgs.
+Expected> PluginCASContext::create(
+    StringRef PluginPath, StringRef OnDiskPath,
+    ArrayRef> PluginArgs) {
+  auto reportError = [PluginPath](const Twine &Description) -> Error {
+    std::error_code EC = inconvertibleErrorCode();
+    return createStringError(EC, "error loading '" + PluginPath +
+                                     "': " + Description);
+  };
+
+  SmallString<256> PathBuf = PluginPath;
+  std::string ErrMsg;
+  sys::DynamicLibrary Lib =
+      sys::DynamicLibrary::getPermanentLibrary(PathBuf.c_str(), &ErrMsg);
+  if (!Lib.isValid())
+    return reportError(ErrMsg);
+
+  llcas_functions_t Functions{};
+
+  // Resolve "llcas_<name>" for every entry of the .def file; a missing
+  // required symbol aborts loading.
+#define CASPLUGINAPI_FUNCTION(name, required)                                  \
+  if (!(Functions.name = (decltype(llcas_functions_t::name))                   \
+            Lib.getAddressOfSymbol("llcas_" #name))) {                         \
+    if (required)                                                              \
+      return reportError("failed symbol 'llcas_" #name "' lookup");            \
+  }
+#include "PluginAPI_functions.def"
+#undef CASPLUGINAPI_FUNCTION
+
+  llcas_cas_options_t c_opts = Functions.cas_options_create();
+  auto _ = make_scope_exit([&]() { Functions.cas_options_dispose(c_opts); });
+
+  Functions.cas_options_set_client_version(c_opts, LLCAS_VERSION_MAJOR,
+                                           LLCAS_VERSION_MINOR);
+  SmallString<256> OnDiskPathBuf = OnDiskPath;
+  Functions.cas_options_set_ondisk_path(c_opts, OnDiskPathBuf.c_str());
+  for (const auto &Pair : PluginArgs) {
+    char *c_err = nullptr;
+    if (Functions.cas_options_set_option(c_opts, Pair.first.c_str(),
+                                         Pair.second.c_str(), &c_err))
+      return errorAndDispose(c_err, Functions);
+  }
+
+  char *c_err = nullptr;
+  llcas_cas_t c_cas = Functions.cas_create(c_opts, &c_err);
+  if (!c_cas)
+    return errorAndDispose(c_err, Functions);
+
+  char *c_schema = Functions.cas_get_hash_schema_name(c_cas);
+  std::string SchemaName = c_schema;
+  Functions.string_dispose(c_schema);
+
+  auto Ctx = std::make_shared();
+  Ctx->Functions = Functions;
+  Ctx->c_cas = c_cas;
+  Ctx->SchemaName = std::move(SchemaName);
+  return Ctx;
+}
+
+//===----------------------------------------------------------------------===//
+// ObjectStore API
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// ObjectStore implementation that forwards every operation to the plugin
+/// held by \c Ctx.
+class PluginObjectStore
+    : public ObjectStore,
+      public std::enable_shared_from_this {
+public:
+  Expected parseID(StringRef ID) final;
+  Expected store(ArrayRef Refs,
+                 ArrayRef Data) final;
+  CASID getID(ObjectRef Ref) const final;
+  std::optional getReference(const CASID &ID) const final;
+  Expected isMaterialized(ObjectRef Ref) const final;
+  Expected> loadIfExists(ObjectRef Ref) final;
+  void
+  loadIfExistsAsync(ObjectRef Ref,
+                    unique_function>)>
+                        Callback) final;
+  uint64_t getDataSize(ObjectHandle Node) const final;
+  Error forEachRef(ObjectHandle Node,
+                   function_ref Callback) const final;
+  ObjectRef readRef(ObjectHandle Node, size_t I) const final;
+  size_t getNumRefs(ObjectHandle Node) const final;
+  ArrayRef getData(ObjectHandle Node,
+                   bool RequiresNullTerminator = false) const final;
+  Error validate(const CASID &ID) final {
+    // Not supported yet. Always return success.
+    return Error::success();
+  }
+
+  PluginObjectStore(std::shared_ptr);
+
+  std::shared_ptr Ctx;
+};
+
+} // anonymous namespace
+
+/// Parse the printed digest \p ID into a \c CASID using the plugin.
+Expected PluginObjectStore::parseID(StringRef ID) {
+  // Use big enough stack so that we don't have to allocate in the heap.
+  SmallString<148> IDBuf(ID);
+  SmallVector BytesBuf(68);
+
+  auto parseDigest = [&]() -> Expected {
+    char *c_err = nullptr;
+    unsigned NumBytes = Ctx->Functions.digest_parse(
+        Ctx->c_cas, IDBuf.c_str(), BytesBuf.data(), BytesBuf.size(), &c_err);
+    if (NumBytes == 0)
+      return Ctx->errorAndDispose(c_err);
+    return NumBytes;
+  };
+
+  Expected NumBytes = parseDigest();
+  if (!NumBytes)
+    return NumBytes.takeError();
+
+  // A result larger than the buffer means the digest did not fit: grow the
+  // buffer to the reported size and parse again.
+  if (*NumBytes > BytesBuf.size()) {
+    BytesBuf.resize(*NumBytes);
+    NumBytes = parseDigest();
+    if (!NumBytes)
+      return NumBytes.takeError();
+    assert(*NumBytes == BytesBuf.size());
+  } else {
+    BytesBuf.truncate(*NumBytes);
+  }
+
+  return CASID::create(Ctx.get(), toStringRef(BytesBuf));
+}
+
+/// Store an object with \p Data and \p Refs through the plugin and return a
+/// reference to it.
+Expected PluginObjectStore::store(ArrayRef Refs,
+                                  ArrayRef Data) {
+  SmallVector c_ids;
+  c_ids.reserve(Refs.size());
+  for (ObjectRef Ref : Refs) {
+    c_ids.push_back(llcas_objectid_t{Ref.getInternalRef(*this)});
+  }
+
+  llcas_objectid_t c_stored_id;
+  char *c_err = nullptr;
+  if (Ctx->Functions.cas_store_object(
+          Ctx->c_cas, llcas_data_t{Data.data(), Data.size()}, c_ids.data(),
+          c_ids.size(), &c_stored_id, &c_err))
+    return Ctx->errorAndDispose(c_err);
+
+  return ObjectRef::getFromInternalRef(*this, c_stored_id.opaque);
+}
+
+static StringRef toStringRef(llcas_digest_t c_digest) {
+  return StringRef((const char *)c_digest.data, c_digest.size);
+}
+
+CASID PluginObjectStore::getID(ObjectRef Ref) const {
+  llcas_objectid_t c_id{Ref.getInternalRef(*this)};
+  llcas_digest_t c_digest =
+      Ctx->Functions.objectid_get_digest(Ctx->c_cas, c_id);
+  return CASID::create(Ctx.get(), toStringRef(c_digest));
+}
+
+/// Map \p ID to an object reference; a plugin failure is fatal.
+std::optional
+PluginObjectStore::getReference(const CASID &ID) const {
+  ArrayRef Hash = ID.getHash();
+  llcas_objectid_t c_id;
+  char *c_err = nullptr;
+  if (Ctx->Functions.cas_get_objectid(
+          Ctx->c_cas, llcas_digest_t{Hash.data(), Hash.size()}, &c_id, &c_err))
+    report_fatal_error(Ctx->errorAndDispose(c_err));
+
+  return ObjectRef::getFromInternalRef(*this, c_id.opaque);
+}
+
+/// Ask the plugin whether \p Ref is present locally (\c globally is false).
+Expected PluginObjectStore::isMaterialized(ObjectRef Ref) const {
+  llcas_objectid_t c_id{Ref.getInternalRef(*this)};
+  char *c_err = nullptr;
+  llcas_lookup_result_t c_result = Ctx->Functions.cas_contains_object(
+      Ctx->c_cas, c_id, /*globally=*/false, &c_err);
+  switch (c_result) {
+  case LLCAS_LOOKUP_RESULT_SUCCESS:
+    return true;
+  case LLCAS_LOOKUP_RESULT_NOTFOUND:
+    return false;
+  case LLCAS_LOOKUP_RESULT_ERROR:
+    return Ctx->errorAndDispose(c_err);
+  }
+  // Never fall off the end of a non-void function if the plugin returns a
+  // value outside the enumeration.
+  llvm_unreachable("unexpected llcas_lookup_result_t value");
+}
+
+/// Load \p Ref through the plugin; returns \c std::nullopt when the plugin
+/// reports the object as not found.
+Expected>
+PluginObjectStore::loadIfExists(ObjectRef Ref) {
+  llcas_objectid_t c_id{Ref.getInternalRef(*this)};
+  llcas_loaded_object_t c_obj;
+  char *c_err = nullptr;
+  llcas_lookup_result_t c_result =
+      Ctx->Functions.cas_load_object(Ctx->c_cas, c_id, &c_obj, &c_err);
+  switch (c_result) {
+  case LLCAS_LOOKUP_RESULT_SUCCESS:
+    return makeObjectHandle(c_obj.opaque);
+  case LLCAS_LOOKUP_RESULT_NOTFOUND:
+    return std::nullopt;
+  case LLCAS_LOOKUP_RESULT_ERROR:
+    return Ctx->errorAndDispose(c_err);
+  }
+  // Never fall off the end of a non-void function if the plugin returns a
+  // value outside the enumeration.
+  llvm_unreachable("unexpected llcas_lookup_result_t value");
+}
+
+void PluginObjectStore::loadIfExistsAsync(
+    ObjectRef Ref,
+    unique_function>)> Callback) {
+  llcas_objectid_t
c_id{Ref.getInternalRef(*this)}; + + struct LoadObjCtx { + std::shared_ptr CAS; + unique_function>)> Callback; + + LoadObjCtx( + std::shared_ptr CAS, + unique_function>)> Callback) + : CAS(std::move(CAS)), Callback(std::move(Callback)) {} + }; + auto LoadObjCB = [](void *c_ctx, llcas_lookup_result_t c_result, + llcas_loaded_object_t c_obj, char *c_err) { + auto getObjAndDispose = + [&](LoadObjCtx *Ctx) -> Expected> { + auto _ = make_scope_exit([Ctx]() { delete Ctx; }); + switch (c_result) { + case LLCAS_LOOKUP_RESULT_SUCCESS: + return Ctx->CAS->makeObjectHandle(c_obj.opaque); + case LLCAS_LOOKUP_RESULT_NOTFOUND: + return std::nullopt; + case LLCAS_LOOKUP_RESULT_ERROR: + return Ctx->CAS->Ctx->errorAndDispose(c_err); + } + }; + + LoadObjCtx *Ctx = static_cast(c_ctx); + auto Callback = std::move(Ctx->Callback); + Callback(getObjAndDispose(Ctx)); + }; + + LoadObjCtx *CallCtx = new LoadObjCtx(shared_from_this(), std::move(Callback)); + Ctx->Functions.cas_load_object_async(Ctx->c_cas, c_id, CallCtx, LoadObjCB); +} + +namespace { + +class ObjectRefsWrapper { +public: + ObjectRefsWrapper(const ObjectHandle &Node, const PluginObjectStore &Store) + : Store(Store), Ctx(*Store.Ctx) { + llcas_loaded_object_t c_obj{Node.getInternalRef(Store)}; + this->c_refs = Ctx.Functions.loaded_object_get_refs(Ctx.c_cas, c_obj); + } + + size_t size() const { + return Ctx.Functions.object_refs_get_count(Ctx.c_cas, c_refs); + } + + ObjectRef operator[](size_t I) const { + llcas_objectid_t c_id = + Ctx.Functions.object_refs_get_id(Ctx.c_cas, c_refs, I); + return ObjectRef::getFromInternalRef(Store, c_id.opaque); + } + +private: + const PluginObjectStore &Store; + PluginCASContext &Ctx; + llcas_object_refs_t c_refs; +}; + +} // namespace + +// FIXME: Replace forEachRef/readRef/getNumRefs APIs with an iterator interface. 
+Error PluginObjectStore::forEachRef( + ObjectHandle Node, function_ref Callback) const { + ObjectRefsWrapper Refs(Node, *this); + for (unsigned I = 0, E = Refs.size(); I != E; ++I) { + if (Error E = Callback(Refs[I])) + return E; + } + return Error::success(); +} + +ObjectRef PluginObjectStore::readRef(ObjectHandle Node, size_t I) const { + ObjectRefsWrapper Refs(Node, *this); + return Refs[I]; +} + +size_t PluginObjectStore::getNumRefs(ObjectHandle Node) const { + ObjectRefsWrapper Refs(Node, *this); + return Refs.size(); +} + +// FIXME: Remove getDataSize(ObjectHandle) from API requirement, +// \c getData(ObjectHandle) should be enough. +uint64_t PluginObjectStore::getDataSize(ObjectHandle Node) const { + ArrayRef Data = getData(Node); + return Data.size(); +} + +ArrayRef PluginObjectStore::getData(ObjectHandle Node, + bool RequiresNullTerminator) const { + // FIXME: Remove RequiresNullTerminator from ObjectStore API requirement? + // It is a requirement for the plugin API. + llcas_data_t c_data = Ctx->Functions.loaded_object_get_data( + Ctx->c_cas, llcas_loaded_object_t{Node.getInternalRef(*this)}); + return ArrayRef((const char *)c_data.data, c_data.size); +} + +PluginObjectStore::PluginObjectStore(std::shared_ptr CASCtx) + : ObjectStore(*CASCtx), Ctx(std::move(CASCtx)) {} + +//===----------------------------------------------------------------------===// +// ActionCache API +//===----------------------------------------------------------------------===// + +namespace { + +class PluginActionCache : public ActionCache { +public: + Expected> getImpl(ArrayRef ResolvedKey, + bool Globally) const final; + void getImplAsync(ArrayRef ResolvedKey, bool Globally, + unique_function>)> + Callback) const final; + + Error putImpl(ArrayRef ResolvedKey, const CASID &Result, + bool Globally) final; + void putImplAsync(ArrayRef ResolvedKey, const CASID &Result, + bool Globally, unique_function Callback) final; + + PluginActionCache(std::shared_ptr); + +private: + 
std::shared_ptr Ctx; +}; + +} // anonymous namespace + +Expected> +PluginActionCache::getImpl(ArrayRef ResolvedKey, bool Globally) const { + llcas_objectid_t c_value; + char *c_err = nullptr; + llcas_lookup_result_t c_result = Ctx->Functions.actioncache_get_for_digest( + Ctx->c_cas, llcas_digest_t{ResolvedKey.data(), ResolvedKey.size()}, + &c_value, Globally, &c_err); + switch (c_result) { + case LLCAS_LOOKUP_RESULT_SUCCESS: { + llcas_digest_t c_digest = + Ctx->Functions.objectid_get_digest(Ctx->c_cas, c_value); + return CASID::create(Ctx.get(), toStringRef(c_digest)); + } + case LLCAS_LOOKUP_RESULT_NOTFOUND: + return std::nullopt; + case LLCAS_LOOKUP_RESULT_ERROR: + return Ctx->errorAndDispose(c_err); + } +} + +void PluginActionCache::getImplAsync( + ArrayRef ResolvedKey, bool Globally, + unique_function>)> Callback) const { + + struct CacheGetCtx { + std::shared_ptr CASCtx; + unique_function>)> Callback; + }; + auto CacheGetCB = [](void *c_ctx, llcas_lookup_result_t c_result, + llcas_objectid_t c_value, char *c_err) { + auto getValueAndDispose = + [&](CacheGetCtx *Ctx) -> Expected> { + auto _ = make_scope_exit([Ctx]() { delete Ctx; }); + switch (c_result) { + case LLCAS_LOOKUP_RESULT_SUCCESS: { + llcas_digest_t c_digest = Ctx->CASCtx->Functions.objectid_get_digest( + Ctx->CASCtx->c_cas, c_value); + return CASID::create(Ctx->CASCtx.get(), toStringRef(c_digest)); + } + case LLCAS_LOOKUP_RESULT_NOTFOUND: + return std::nullopt; + case LLCAS_LOOKUP_RESULT_ERROR: + return Ctx->CASCtx->errorAndDispose(c_err); + } + }; + + CacheGetCtx *Ctx = static_cast(c_ctx); + auto Callback = std::move(Ctx->Callback); + Callback(getValueAndDispose(Ctx)); + }; + + CacheGetCtx *CallCtx = new CacheGetCtx{this->Ctx, std::move(Callback)}; + Ctx->Functions.actioncache_get_for_digest_async( + Ctx->c_cas, llcas_digest_t{ResolvedKey.data(), ResolvedKey.size()}, + Globally, CallCtx, CacheGetCB); +} + +Error PluginActionCache::putImpl(ArrayRef ResolvedKey, + const CASID &Result, bool Globally) { 
+ ArrayRef Hash = Result.getHash(); + llcas_objectid_t c_value; + char *c_err = nullptr; + if (Ctx->Functions.cas_get_objectid(Ctx->c_cas, + llcas_digest_t{Hash.data(), Hash.size()}, + &c_value, &c_err)) + return Ctx->errorAndDispose(c_err); + + if (Ctx->Functions.actioncache_put_for_digest( + Ctx->c_cas, llcas_digest_t{ResolvedKey.data(), ResolvedKey.size()}, + c_value, Globally, &c_err)) + return Ctx->errorAndDispose(c_err); + + return Error::success(); +} + +void PluginActionCache::putImplAsync(ArrayRef ResolvedKey, + const CASID &Result, bool Globally, + unique_function Callback) { + ArrayRef Hash = Result.getHash(); + llcas_objectid_t c_value; + char *c_err = nullptr; + if (Ctx->Functions.cas_get_objectid(Ctx->c_cas, + llcas_digest_t{Hash.data(), Hash.size()}, + &c_value, &c_err)) + return Callback(Ctx->errorAndDispose(c_err)); + + struct CachePutCtx { + std::shared_ptr CASCtx; + unique_function Callback; + }; + auto CachePutCB = [](void *c_ctx, bool failed, char *c_err) { + auto checkForErrorAndDispose = [&](CachePutCtx *Ctx) -> Error { + auto _ = make_scope_exit([Ctx]() { delete Ctx; }); + if (failed) + return Ctx->CASCtx->errorAndDispose(c_err); + return Error::success(); + }; + + CachePutCtx *Ctx = static_cast(c_ctx); + auto Callback = std::move(Ctx->Callback); + Callback(checkForErrorAndDispose(Ctx)); + }; + + CachePutCtx *CallCtx = new CachePutCtx{this->Ctx, std::move(Callback)}; + Ctx->Functions.actioncache_put_for_digest_async( + Ctx->c_cas, llcas_digest_t{ResolvedKey.data(), ResolvedKey.size()}, + c_value, Globally, CallCtx, CachePutCB); +} + +PluginActionCache::PluginActionCache(std::shared_ptr CASCtx) + : ActionCache(*CASCtx), Ctx(std::move(CASCtx)) {} + +//===----------------------------------------------------------------------===// +// createPluginCASDatabases API +//===----------------------------------------------------------------------===// + +Expected, std::shared_ptr>> +cas::createPluginCASDatabases( + StringRef PluginPath, StringRef 
OnDiskPath, + ArrayRef> PluginArgs) { + std::shared_ptr Ctx; + if (Error E = PluginCASContext::create(PluginPath, OnDiskPath, PluginArgs) + .moveInto(Ctx)) + return std::move(E); + auto CAS = std::make_shared(Ctx); + auto AC = std::make_shared(std::move(Ctx)); + return std::make_pair(std::move(CAS), std::move(AC)); +} diff --git a/llvm/lib/CAS/TreeEntry.cpp b/llvm/lib/CAS/TreeEntry.cpp new file mode 100644 index 00000000000000..712ae40be15d6f --- /dev/null +++ b/llvm/lib/CAS/TreeEntry.cpp @@ -0,0 +1,47 @@ +//===- Utils.cpp ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/BinaryFormat/Magic.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/CAS/TreeSchema.h" +#include "llvm/Support/MemoryBufferRef.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/StringSaver.h" + +using namespace llvm; +using namespace llvm::cas; + +static void printTreeEntryKind(raw_ostream &OS, TreeEntry::EntryKind Kind) { + switch (Kind) { + case TreeEntry::Regular: + OS << "file"; + break; + case TreeEntry::Executable: + OS << "exec"; + break; + case TreeEntry::Symlink: + OS << "syml"; + break; + case TreeEntry::Tree: + OS << "tree"; + break; + } +} + +void cas::NamedTreeEntry::print(raw_ostream &OS, ObjectStore &CAS) const { + printTreeEntryKind(OS, getKind()); + OS << " " << CAS.getID(getRef()) << " " << Name; + if (getKind() == TreeEntry::Tree) + OS << "/"; + if (getKind() == TreeEntry::Symlink) { + ObjectProxy Target = cantFail(CAS.getProxy(getRef())); + OS << " -> "; + OS << Target.getData(); + } + OS << "\n"; +} diff --git a/llvm/lib/CAS/TreeSchema.cpp b/llvm/lib/CAS/TreeSchema.cpp new file mode 100644 index 00000000000000..bb2ff7d6ec3bf3 --- /dev/null +++ 
b/llvm/lib/CAS/TreeSchema.cpp @@ -0,0 +1,231 @@ +//===- TreeSchema.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/TreeSchema.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/MemoryBufferRef.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/StringSaver.h" + +using namespace llvm; +using namespace llvm::cas; + +char TreeSchema::ID = 0; +constexpr StringLiteral TreeSchema::SchemaName; + +void TreeSchema::anchor() {} + +bool TreeSchema::isNode(const ObjectProxy &Node) const { + // Load the first ref to check its content. + if (Node.getNumReferences() < 1) + return false; + + // If can't load the first ref, consume error and return false. 
+ auto FirstRef = Node.getReference(0); + return FirstRef == getKindRef(); +} + +TreeSchema::TreeSchema(cas::ObjectStore &CAS) : TreeSchema::RTTIExtends(CAS) { + TreeKindRef = cantFail(CAS.storeFromString(std::nullopt, SchemaName)); +} + +ObjectRef TreeSchema::getKindRef() const { return *TreeKindRef; } + +size_t TreeSchema::getNumTreeEntries(TreeProxy Tree) const { + return Tree.getNumReferences() - 1; +} + +Error TreeSchema::forEachTreeEntry( + TreeProxy Tree, + function_ref Callback) const { + for (size_t I = 0, IE = getNumTreeEntries(Tree); I != IE; ++I) + if (Error E = Callback(loadTreeEntry(Tree, I))) + return E; + + return Error::success(); +} + +Error TreeSchema::walkFileTreeRecursively( + ObjectStore &CAS, ObjectRef Root, + function_ref)> + Callback) { + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + SmallString<128> PathStorage; + SmallVector Stack; + Stack.emplace_back(Root, TreeEntry::Tree, "/"); + + while (!Stack.empty()) { + if (Stack.back().getKind() != TreeEntry::Tree) { + if (Error E = Callback(Stack.pop_back_val(), std::nullopt)) + return E; + continue; + } + + NamedTreeEntry Parent = Stack.pop_back_val(); + Expected ExpTree = load(Parent.getRef()); + if (Error E = ExpTree.takeError()) + return E; + TreeProxy Tree = *ExpTree; + if (Error E = Callback(Parent, Tree)) + return E; + for (int I = Tree.size(), E = 0; I != E; --I) { + std::optional Child = Tree.get(I - 1); + assert(Child && "Expected no corruption"); + + PathStorage = Parent.getName(); + sys::path::append(PathStorage, sys::path::Style::posix, Child->getName()); + Stack.emplace_back(Child->getRef(), Child->getKind(), + Saver.save(StringRef(PathStorage))); + } + } + + return Error::success(); +} + +NamedTreeEntry TreeSchema::loadTreeEntry(TreeProxy Tree, size_t I) const { + // Load entry from TreeNode. 
+ TreeEntry::EntryKind Kind = + (TreeEntry::EntryKind) + Tree.getData()[I + (Tree.size() + 1) * sizeof(uint32_t)]; + + StringRef Name = Tree.getName(I); + auto ObjectRef = Tree.getReference(I + 1); + + return {ObjectRef, Kind, Name}; +} + +std::optional TreeSchema::lookupTreeEntry(TreeProxy Tree, + StringRef Name) const { + size_t NumNames = Tree.size(); + if (!NumNames) + return std::nullopt; + + // Start with a binary search, if there are enough entries. + // + // FIXME: Should just use std::lower_bound, but we need the actual iterators + // to know the index in the NameCache... + const size_t MaxLinearSearchSize = 4; + size_t Last = NumNames; + size_t First = 0; + while (Last - First > MaxLinearSearchSize) { + auto I = First + (Last - First) / 2; + StringRef NameI = Tree.getName(I); + switch (Name.compare(NameI)) { + case 0: + return I; + case -1: + Last = I; + break; + case 1: + First = I + 1; + break; + } + } + + // Use a linear search for small trees. + for (; First != Last; ++First) + if (Name == Tree.getName(First)) + return First; + + return std::nullopt; +} + +Expected TreeSchema::load(ObjectRef Object) const { + auto TreeNode = CAS.getProxy(Object); + if (!TreeNode) + return TreeNode.takeError(); + + return load(*TreeNode); +} + +Expected TreeSchema::load(ObjectProxy Object) const { + if (!isNode(Object)) + return createStringError(inconvertibleErrorCode(), "not a tree object"); + + return TreeProxy::get(*this, Object); +} + +Expected TreeSchema::create(ArrayRef Entries) { + return TreeProxy::create(*this, Entries); +} + +Expected TreeProxy::get(const TreeSchema &Schema, + Expected Ref) { + if (!Ref) + return Ref.takeError(); + return TreeProxy(Schema, *Ref); +} + +Expected TreeProxy::create(TreeSchema &Schema, + ArrayRef Entries) { + auto B = Builder::startNode(Schema); + if (!B) + return B.takeError(); + + return B->build(Entries); +} + +StringRef TreeProxy::getName(size_t I) const { + uint32_t StartIdx = + support::endian::read32le(getData().data() + 
sizeof(uint32_t) * I); + uint32_t EndIdx = + support::endian::read32le(getData().data() + sizeof(uint32_t) * (I + 1)); + + return StringRef(getData().data() + StartIdx, EndIdx - StartIdx); +} + +Expected TreeProxy::Builder::startNode(TreeSchema &Schema) { + Builder B(Schema); + B.Refs.push_back(Schema.getKindRef()); + return std::move(B); +} + +Expected +TreeProxy::Builder::build(ArrayRef Entries) { + // Ensure a stable order for tree entries and ignore name collisions. + SmallVector Sorted(Entries.begin(), Entries.end()); + std::stable_sort(Sorted.begin(), Sorted.end()); + Sorted.erase(std::unique(Sorted.begin(), Sorted.end()), Sorted.end()); + + raw_svector_ostream OS(Data); + support::endian::Writer Writer(OS, endianness::little); + // Encode the entires in the Data. The layout of the tree schema object is: + // * Name offset table: The offset of in the data blob for where to find the + // string. It has N + 1 entries and you can find the name of n-th entry at + // offset[n] -> offset[n+1]. Each offset is encoded as little-endian + // uint32_t. + // * Kind: uint8_t for each entry. + // * Object: ObjectRef for each entry is at n + 1 refs for the object (with + // the first one being the tree kind ID). + + // Write Name. + // The start of the string table index. + uint32_t StrIdx = + sizeof(uint8_t) * Sorted.size() + sizeof(uint32_t) * (Sorted.size() + 1); + for (auto &Entry : Sorted) { + Writer.write(StrIdx); + StrIdx += Entry.getName().size(); + + // Append refs. + Refs.push_back(Entry.getRef()); + } + // Write the end index for the last string. + Writer.write(StrIdx); + + // Write Kind. + for (auto &Entry : Sorted) + Writer.write((uint8_t)Entry.getKind()); + + // Write names in the end of the block. 
+ for (auto &Entry : Sorted) + OS << Entry.getName(); + + return TreeProxy::get(*Schema, Schema->CAS.createProxy(Refs, Data)); +} diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp new file mode 100644 index 00000000000000..a36dd49fb01fa7 --- /dev/null +++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp @@ -0,0 +1,339 @@ +//===- UnifiedOnDiskCache.cpp -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one +// directory while also restricting storage growth with a scheme of chaining the +// two most recent directories (primary & upstream), where the primary +// "faults-in" data from the upstream one. When the primary (most recent) +// directory exceeds its intended limit a new empty directory becomes the +// primary one. +// +// Within the top-level directory (the path that \p UnifiedOnDiskCache::open +// receives) there are directories named like this: +// +// 'v.' +// 'v..' +// ... +// +// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme and +// the part after the dot is an increasing integer. The primary directory is the +// one with the highest integer and the upstream one is the directory before it. +// For example, if the sub-directories contained are: +// +// 'v1.5', 'v1.6', 'v1.7', 'v1.8' +// +// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are +// unused directories that can be safely deleted at any time and by any process. +// +// Contained within the top-level directory is a file named "lock" which is used +// for processes to take shared or exclusive locks for the contents of the top +// directory. 
While a \p UnifiedOnDiskCache is open it keeps a shared lock for +// the top-level directory; when it closes, if the primary sub-directory +// exceeded its limit, it attempts to get an exclusive lock in order to create a +// new empty primary directory; if it can't get the exclusive lock it gives up +// and lets the next \p UnifiedOnDiskCache instance that closes to attempt +// again. +// +// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a +// directory, by any process, the storage size in that directory will keep +// growing unrestricted. But the major benefit is that garbage-collection can be +// triggered on a directory concurrently, at any time and by any process, +// without affecting any active readers/writers in the same process or other +// processes. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out +/// how to handle the leftover sub-directories of the previous version, within +/// the \p UnifiedOnDiskCache::collectGarbage function. 
+static constexpr StringLiteral DBDirPrefix = "v1.";
+
+/// Convenience overload: uses the digest that the primary graph database
+/// reports for \p Key as the key bytes and forwards to the digest-keyed
+/// overload below.
+Expected UnifiedOnDiskCache::KVPut(ObjectID Key, ObjectID Value) {
+  return KVPut(PrimaryGraphDB->getDigest(Key), Value);
+}
+
+/// Stores \p Value under \p Key in the primary key-value database.
+///
+/// The value is written as the 64-bit little-endian encoding of
+/// \c Value.getOpaqueData(). The bytes handed back by \c put are decoded the
+/// same way and returned as an \c ObjectID; errors from \c put are propagated.
+/// NOTE(review): the local is named \c Existing, so \c put presumably hands
+/// back the value already stored for \p Key when there is one -- confirm
+/// against \c OnDiskKeyValueDB::put.
+Expected UnifiedOnDiskCache::KVPut(ArrayRef Key,
+                                   ObjectID Value) {
+  // The on-disk value format below is exactly one uint64_t; fail the build if
+  // the opaque representation ever changes size.
+  static_assert(sizeof(Value.getOpaqueData()) == sizeof(uint64_t),
+                "unexpected return opaque type");
+  std::array ValBytes;
+  support::endian::write64le(ValBytes.data(), Value.getOpaqueData());
+  Expected> Existing = PrimaryKVDB->put(Key, ValBytes);
+  if (!Existing)
+    return Existing.takeError();
+  assert(Existing->size() == sizeof(uint64_t));
+  return ObjectID::fromOpaqueData(support::endian::read64le(Existing->data()));
+}
+
+/// Looks up \p Key in the primary key-value database.
+///
+/// On a miss, falls back to \c faultInFromUpstreamKV when an upstream
+/// key-value database is present; otherwise returns \c std::nullopt. Lookup
+/// errors are propagated.
+Expected>
+UnifiedOnDiskCache::KVGet(ArrayRef Key) {
+  std::optional> Value;
+  if (Error E = PrimaryKVDB->get(Key).moveInto(Value))
+    return std::move(E);
+  if (!Value) {
+    // Not in the primary database: consult the upstream one, if any.
+    if (UpstreamKVDB)
+      return faultInFromUpstreamKV(Key);
+    return std::nullopt;
+  }
+  // Stored values are the 64-bit little-endian encoding written by KVPut.
+  assert(Value->size() == sizeof(uint64_t));
+  return ObjectID::fromOpaqueData(support::endian::read64le(Value->data()));
+}
+
+/// Looks up \p Key in the upstream key-value database and, if it is found,
+/// records the translated value in the primary one.
+///
+/// Both upstream databases must be set (asserted). Returns \c std::nullopt
+/// when the upstream has no entry for \p Key; otherwise returns the result of
+/// \c KVPut on the primary database. Lookup errors are propagated.
+Expected>
+UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef Key) {
+  assert(UpstreamGraphDB);
+  assert(UpstreamKVDB);
+
+  std::optional> UpstreamValue;
+  if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue))
+    return std::move(E);
+  if (!UpstreamValue)
+    return std::nullopt;
+
+  // The value is the \p ObjectID in the context of the upstream
+  // \p OnDiskGraphDB instance. Translate it to the context of the primary
+  // \p OnDiskGraphDB instance.
+  assert(UpstreamValue->size() == sizeof(uint64_t));
+  ObjectID UpstreamID = ObjectID::fromOpaqueData(
+      support::endian::read64le(UpstreamValue->data()));
+  // The translation goes through the digest: ask the upstream graph for the
+  // digest of its ID, then ask the primary graph for its reference to that
+  // digest.
+  ObjectID PrimaryID =
+      PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID));
+  return KVPut(Key, PrimaryID);
+}
+
+/// Appends to \p DBDirs the names of all sub-directories of \p Path whose name
+/// starts with \p DBDirPrefix, sorted in ascending order of the integer that
+/// follows the prefix. Fails if the text after the prefix is not a decimal
+/// integer, or if iterating over \p Path fails.
+static Error getAllDBDirs(StringRef Path,
+                          SmallVectorImpl &DBDirs) {
+  // A matching sub-directory together with the integer parsed from its name,
+  // which is the sort key.
+  struct DBDir {
+    uint64_t Order;
+    std::string Name;
+  };
+  SmallVector FoundDBDirs;
+
+  std::error_code EC;
+  for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE;
+       DirI.increment(EC)) {
+    // Only directories whose name starts with the prefix are considered;
+    // everything else (e.g. the "lock" file) is skipped.
+    if (DirI->type() != sys::fs::file_type::directory_file)
+      continue;
+    StringRef SubDir = sys::path::filename(DirI->path());
+    if (!SubDir.starts_with(DBDirPrefix))
+      continue;
+    // A prefixed directory whose suffix is not a decimal integer is an error
+    // rather than something to skip silently.
+    uint64_t Order;
+    if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order))
+      return createStringError(inconvertibleErrorCode(),
+                               "unexpected directory " + DirI->path());
+    FoundDBDirs.push_back({Order, std::string(SubDir)});
+  }
+  // The loop also stops when creating or advancing the iterator fails.
+  if (EC)
+    return createFileError(Path, EC);
+
+  // The comparator must be a strict weak ordering: it has to return false for
+  // equal keys, so compare with '<', not '<='. Equal keys can occur (e.g.
+  // "v1.7" and "v1.07" both parse to 7).
+  llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool {
+    return LHS.Order < RHS.Order;
+  });
+  for (DBDir &Dir : FoundDBDirs)
+    DBDirs.push_back(std::move(Dir.Name));
+  return Error::success();
+}
+
+/// Given a sub-directory name \p DBDir made of \p DBDirPrefix followed by a
+/// decimal integer N, writes to \p OS the name with the next index: the same
+/// prefix followed by N + 1.
+static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) {
+  assert(DBDir.starts_with(DBDirPrefix));
+  uint64_t Count;
+  bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count);
+  assert(!Failed);
+  // Failed is otherwise only read by the assert above.
+  (void)Failed;
+  OS << DBDirPrefix << Count + 1;
+}
+
+/// Opens the cache rooted at \p RootPath, creating the root directory and its
+/// "lock" file when they do not exist yet.
+///
+/// Takes a shared lock on the lock file, then opens the graph and key-value
+/// databases of the most recent sub-directory as the primary ones and, when at
+/// least two sub-directories exist, those of the second most recent one as the
+/// upstream ones. On success the lock file descriptor is handed to the
+/// returned object; on any failure it is closed before returning.
+Expected>
+UnifiedOnDiskCache::open(StringRef RootPath, std::optional SizeLimit,
+                         StringRef HashName, unsigned HashByteSize,
+                         OnDiskGraphDB::FaultInPolicy FaultInPolicy) {
+  if (std::error_code EC = sys::fs::create_directories(RootPath))
+    return createFileError(RootPath, EC);
+
+  SmallString<256> PathBuf(RootPath);
+  sys::path::append(PathBuf, "lock");
+  int LockFD = -1;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+    return createFileError(PathBuf, EC);
+  assert(LockFD != -1);
+  // From here on every early error return must close the lock file, otherwise
+  // the descriptor leaks. The guard is declared before the database handles
+  // below, so on an error path they are destroyed first and the descriptor is
+  // closed last. It is released once the descriptor is handed to the new
+  // object.
+  auto CloseLockOnError = make_scope_exit([&LockFD]() {
+    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+    sys::fs::closeFile(LockFile);
+  });
+  // Locking the directory using shared lock, which will prevent other processes
+  // from creating a new chain (essentially while a \p UnifiedOnDiskCache
+  // instance holds a shared lock the storage for the primary directory will
+  // grow unrestricted).
+  if (std::error_code EC = sys::fs::lockFile(LockFD, /*Exclusive=*/false))
+    return createFileError(PathBuf, EC);
+
+  SmallVector DBDirs;
+  if (Error E = getAllDBDirs(RootPath, DBDirs))
+    return std::move(E);
+  // Nothing on disk yet: start with index 1.
+  if (DBDirs.empty())
+    DBDirs.push_back((Twine(DBDirPrefix) + "1").str());
+
+  assert(!DBDirs.empty());
+
+  /// If there is only one directory open databases on it. If there are 2 or
+  /// more directories, get the most recent directories and chain them, with the
+  /// most recent being the primary one. The remaining directories are unused
+  /// data that can be garbage-collected.
+  std::unique_ptr UpstreamGraphDB;
+  std::unique_ptr UpstreamKVDB;
+  if (DBDirs.size() > 1) {
+    StringRef UpstreamDir = *(DBDirs.end() - 2);
+    PathBuf = RootPath;
+    sys::path::append(PathBuf, UpstreamDir);
+    if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+                                      /*UpstreamDB=*/nullptr, FaultInPolicy)
+                      .moveInto(UpstreamGraphDB))
+      return std::move(E);
+    if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+                                         /*ValueName=*/"objectid",
+                                         /*ValueSize=*/sizeof(uint64_t))
+                      .moveInto(UpstreamKVDB))
+      return std::move(E);
+  }
+  // Keep a raw pointer: the owning pointer is moved into the primary graph
+  // database just below.
+  OnDiskGraphDB *UpstreamGraphDBPtr = UpstreamGraphDB.get();
+
+  StringRef PrimaryDir = *(DBDirs.end() - 1);
+  PathBuf = RootPath;
+  sys::path::append(PathBuf, PrimaryDir);
+  std::unique_ptr PrimaryGraphDB;
+  if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+                                    std::move(UpstreamGraphDB), FaultInPolicy)
+                    .moveInto(PrimaryGraphDB))
+    return std::move(E);
+  std::unique_ptr PrimaryKVDB;
+  // \p UnifiedOnDiskCache does manual chaining for key-value requests,
+  // including an extra translation step of the value during fault-in.
+  if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+                                       /*ValueName=*/"objectid",
+                                       /*ValueSize=*/sizeof(uint64_t))
+                    .moveInto(PrimaryKVDB))
+    return std::move(E);
+
+  auto UniDB = std::unique_ptr(new UnifiedOnDiskCache());
+  UniDB->RootPath = RootPath;
+  UniDB->SizeLimit = SizeLimit;
+  UniDB->LockFD = LockFD;
+  // The new object now owns the descriptor and closes it in close().
+  CloseLockOnError.release();
+  // More than two sub-directories means there are unused ones to delete.
+  UniDB->NeedsGarbageCollection = DBDirs.size() > 2;
+  UniDB->PrimaryDBDir = PrimaryDir;
+  UniDB->UpstreamGraphDB = UpstreamGraphDBPtr;
+  UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB);
+  UniDB->UpstreamKVDB = std::move(UpstreamKVDB);
+  UniDB->PrimaryKVDB = std::move(PrimaryKVDB);
+
+  return std::move(UniDB);
+}
+
+/// \returns true if a size limit is set and the combined storage size of the
+/// primary graph and key-value databases is greater than half of that limit.
+bool UnifiedOnDiskCache::hasExceededSizeLimit() const {
+  if (!SizeLimit)
+    return false;
+  // We allow each of the directories in the chain to reach up to half the
+  // intended size limit. Check whether the primary directory has exceeded half
+  // the limit or not, in order to decide whether we need to start a new chain.
+  //
+  // We could check the size limit against the sum of sizes of both the primary
+  // and upstream directories but then if the upstream is significantly larger
+  // than the intended limit, it would trigger a new chain to be created before
+  // the primary has reached its own limit. Essentially in such situation we
+  // prefer reclaiming the storage later in order to have more consistent cache
+  // hits behavior.
+  return (*SizeLimit / 2) <
+         (PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize());
+}
+
+/// Releases the databases, unlocks and closes the lock file. Does nothing if
+/// the cache is already closed.
+///
+/// When \p CheckSizeLimit is set and the primary directory exceeded its limit,
+/// additionally makes one non-blocking attempt to take the exclusive lock and
+/// create the next primary directory.
+Error UnifiedOnDiskCache::close(bool CheckSizeLimit) {
+  if (LockFD == -1)
+    return Error::success(); // already closed.
+  // Whatever path is taken below, the lock file is closed and LockFD reset on
+  // the way out.
+  auto _1 = make_scope_exit([&]() {
+    assert(LockFD >= 0);
+    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+    sys::fs::closeFile(LockFile);
+    LockFD = -1;
+  });
+
+  // The size has to be read before the databases are released.
+  bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false;
+  PrimaryKVDB.reset();
+  UpstreamKVDB.reset();
+  PrimaryGraphDB.reset();
+  UpstreamGraphDB = nullptr;
+  if (std::error_code EC = sys::fs::unlockFile(LockFD))
+    return createFileError(RootPath, EC);
+
+  if (!ExceededSizeLimit)
+    return Error::success();
+
+  // The primary directory exceeded its intended size limit. Try to get an
+  // exclusive lock in order to create a new primary directory for next time
+  // this \p UnifiedOnDiskCache path is opened.
+
+  if (std::error_code EC = sys::fs::tryLockFile(
+          LockFD, std::chrono::milliseconds(0), /*Exclusive=*/true)) {
+    if (EC == errc::no_lock_available)
+      return Error::success(); // couldn't get exclusive lock, give up.
+    return createFileError(RootPath, EC);
+  }
+  auto _2 = make_scope_exit([&]() { sys::fs::unlockFile(LockFD); });
+
+  // Managed to get an exclusive lock which means there are no other open
+  // \p UnifiedOnDiskCache instances for the same path, so we can safely start a
+  // new primary directory. To start a new primary directory we just have to
+  // create a new empty directory with the next consecutive index; since this is
+  // an atomic operation we will leave the top-level directory in a consistent
+  // state even if the process dies during this code-path.
+
+  SmallString<256> PathBuf(RootPath);
+  raw_svector_ostream OS(PathBuf);
+  OS << sys::path::get_separator();
+  getNextDBDirName(PrimaryDBDir, OS);
+  if (std::error_code EC = sys::fs::create_directory(PathBuf))
+    return createFileError(PathBuf, EC);
+
+  NeedsGarbageCollection = true;
+  return Error::success();
+}
+
+UnifiedOnDiskCache::UnifiedOnDiskCache() = default;
+
+// Any error from close() is dropped here; call close() explicitly to see it.
+UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); }
+
+/// Removes every database sub-directory of \p Path except the two most recent
+/// ones (the last two names returned by \c getAllDBDirs).
+Error UnifiedOnDiskCache::collectGarbage(StringRef Path) {
+  SmallVector DBDirs;
+  if (Error E = getAllDBDirs(Path, DBDirs))
+    return E;
+  if (DBDirs.size() <= 2)
+    return Error::success(); // no unused directories.
+
+  // FIXME: When the version of \p DBDirPrefix is bumped up we need to figure
+  // out how to handle the leftover sub-directories of the previous version.
+
+  SmallString<256> PathBuf(Path);
+  for (StringRef UnusedSubDir : ArrayRef(DBDirs).drop_back(2)) {
+    sys::path::append(PathBuf, UnusedSubDir);
+    if (std::error_code EC = sys::fs::remove_directories(PathBuf))
+      return createFileError(PathBuf, EC);
+    // Restore PathBuf to Path for the next iteration.
+    sys::path::remove_filename(PathBuf);
+  }
+  return Error::success();
+}
diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt
index 503c77cb13bd07..b06f4ffd83ff5a 100644
--- a/llvm/lib/CMakeLists.txt
+++ b/llvm/lib/CMakeLists.txt
@@ -9,6 +9,7 @@ add_subdirectory(FileCheck)
 add_subdirectory(InterfaceStub)
 add_subdirectory(IRPrinter)
 add_subdirectory(IRReader)
+add_subdirectory(CAS)
 add_subdirectory(CGData)
 add_subdirectory(CodeGen)
 add_subdirectory(CodeGenTypes)
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 97188b0672f032..0625d375bad487 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -248,6 +248,7 @@ add_llvm_component_library(LLVMSupport
   TimeProfiler.cpp
   Timer.cpp
   ToolOutputFile.cpp
+  TrieRawHashMap.cpp
   Twine.cpp
   TypeSize.cpp
   Unicode.cpp
diff --git a/llvm/lib/Support/TrieHashIndexGenerator.h b/llvm/lib/Support/TrieHashIndexGenerator.h
new file mode 100644
index 00000000000000..c9e9b70e10d3c7
--- /dev/null
+++ b/llvm/lib/Support/TrieHashIndexGenerator.h
@@ -0,0 +1,89 @@
+//===- TrieHashIndexGenerator.h ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_SUPPORT_TRIEHASHINDEXGENERATOR_H +#define LLVM_LIB_SUPPORT_TRIEHASHINDEXGENERATOR_H + +#include "llvm/ADT/ArrayRef.h" +#include + +namespace llvm { + +struct IndexGenerator { + size_t NumRootBits; + size_t NumSubtrieBits; + ArrayRef Bytes; + std::optional StartBit = std::nullopt; + + size_t getNumBits() const { + assert(StartBit); + size_t TotalNumBits = Bytes.size() * 8; + assert(*StartBit <= TotalNumBits); + return std::min(*StartBit ? NumSubtrieBits : NumRootBits, + TotalNumBits - *StartBit); + } + size_t next() { + size_t Index; + if (!StartBit) { + StartBit = 0; + Index = getIndex(Bytes, *StartBit, NumRootBits); + } else { + *StartBit += *StartBit ? NumSubtrieBits : NumRootBits; + assert((*StartBit - NumRootBits) % NumSubtrieBits == 0); + Index = getIndex(Bytes, *StartBit, NumSubtrieBits); + } + return Index; + } + + size_t hint(unsigned Index, unsigned Bit) { + assert(Index >= 0); + assert(Bit < Bytes.size() * 8); + assert(Bit == 0 || (Bit - NumRootBits) % NumSubtrieBits == 0); + StartBit = Bit; + return Index; + } + + size_t getCollidingBits(ArrayRef CollidingBits) const { + assert(StartBit); + return getIndex(CollidingBits, *StartBit, NumSubtrieBits); + } + + static size_t getIndex(ArrayRef Bytes, size_t StartBit, + size_t NumBits) { + assert(StartBit < Bytes.size() * 8); + + Bytes = Bytes.drop_front(StartBit / 8u); + StartBit %= 8u; + size_t Index = 0; + for (uint8_t Byte : Bytes) { + size_t ByteStart = 0, ByteEnd = 8; + if (StartBit) { + ByteStart = StartBit; + Byte &= (1u << (8 - StartBit)) - 1u; + StartBit = 0; + } + size_t CurrentNumBits = ByteEnd - ByteStart; + if (CurrentNumBits > NumBits) { + Byte >>= CurrentNumBits - NumBits; + CurrentNumBits = NumBits; + } + Index <<= CurrentNumBits; + Index |= Byte & ((1u << CurrentNumBits) - 1u); + + assert(NumBits >= CurrentNumBits); + NumBits -= 
CurrentNumBits; + if (!NumBits) + break; + } + return Index; + } +}; + +} // namespace llvm + +#endif // LLVM_LIB_SUPPORT_TRIEHASHINDEXGENERATOR_H diff --git a/llvm/lib/Support/TrieRawHashMap.cpp b/llvm/lib/Support/TrieRawHashMap.cpp new file mode 100644 index 00000000000000..a6818d434e0d13 --- /dev/null +++ b/llvm/lib/Support/TrieRawHashMap.cpp @@ -0,0 +1,483 @@ +//===- TrieRawHashMap.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/TrieRawHashMap.h" +#include "TrieHashIndexGenerator.h" +#include "llvm/ADT/LazyAtomicPointer.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ThreadSafeAllocator.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; + +namespace { +struct TrieNode { + const bool IsSubtrie = false; + + TrieNode(bool IsSubtrie) : IsSubtrie(IsSubtrie) {} + + static void *operator new(size_t Size) { return ::malloc(Size); } + void operator delete(void *Ptr) { ::free(Ptr); } +}; + +struct TrieContent final : public TrieNode { + const uint8_t ContentOffset; + const uint8_t HashSize; + const uint8_t HashOffset; + + void *getValuePointer() const { + auto Content = reinterpret_cast(this) + ContentOffset; + return const_cast(Content); + } + + ArrayRef getHash() const { + auto *Begin = reinterpret_cast(this) + HashOffset; + return ArrayRef(Begin, Begin + HashSize); + } + + TrieContent(size_t ContentOffset, size_t HashSize, size_t HashOffset) + : TrieNode(/*IsSubtrie=*/false), ContentOffset(ContentOffset), + HashSize(HashSize), HashOffset(HashOffset) {} +}; +static_assert(sizeof(TrieContent) == + 
ThreadSafeTrieRawHashMapBase::TrieContentBaseSize, + "Check header assumption!"); + +class TrieSubtrie final : public TrieNode { +public: + TrieNode *get(size_t I) const { return Slots[I].load(); } + + TrieSubtrie * + sink(size_t I, TrieContent &Content, size_t NumSubtrieBits, size_t NewI, + function_ref)> Saver); + + static std::unique_ptr create(size_t StartBit, size_t NumBits); + + explicit TrieSubtrie(size_t StartBit, size_t NumBits); + +private: + // FIXME: Use a bitset to speed up access: + // + // std::array, NumSlots/64> IsSet; + // + // This will avoid needing to visit sparsely filled slots in + // \a ThreadSafeTrieRawHashMapBase::destroyImpl() when there's a non-trivial + // destructor. + // + // It would also greatly speed up iteration, if we add that some day, and + // allow get() to return one level sooner. + // + // This would be the algorithm for updating IsSet (after updating Slots): + // + // std::atomic &Bits = IsSet[I.High]; + // const uint64_t NewBit = 1ULL << I.Low; + // uint64_t Old = 0; + // while (!Bits.compare_exchange_weak(Old, Old | NewBit)) + // ; + + // For debugging. + unsigned StartBit = 0; + unsigned NumBits = 0; + friend class llvm::ThreadSafeTrieRawHashMapBase; + +public: + /// Linked list for ownership of tries. The pointer is owned by TrieSubtrie. + std::atomic Next; + + /// The (co-allocated) slots of the subtrie. 
+ MutableArrayRef> Slots; +}; +} // end namespace + +namespace llvm { +template <> struct isa_impl { + static inline bool doit(const TrieNode &TN) { return !TN.IsSubtrie; } +}; +template <> struct isa_impl { + static inline bool doit(const TrieNode &TN) { return TN.IsSubtrie; } +}; +} // end namespace llvm + +static size_t getTrieTailSize(size_t StartBit, size_t NumBits) { + assert(NumBits < 20 && "Tries should have fewer than ~1M slots"); + return sizeof(TrieNode *) * (1u << NumBits); +} + +std::unique_ptr TrieSubtrie::create(size_t StartBit, + size_t NumBits) { + size_t Size = sizeof(TrieSubtrie) + getTrieTailSize(StartBit, NumBits); + void *Memory = ::malloc(Size); + TrieSubtrie *S = ::new (Memory) TrieSubtrie(StartBit, NumBits); + return std::unique_ptr(S); +} + +TrieSubtrie::TrieSubtrie(size_t StartBit, size_t NumBits) + : TrieNode(true), StartBit(StartBit), NumBits(NumBits), Next(nullptr), + Slots(reinterpret_cast *>( + reinterpret_cast(this) + sizeof(TrieSubtrie)), + (1u << NumBits)) { + for (auto *I = Slots.begin(), *E = Slots.end(); I != E; ++I) + new (I) LazyAtomicPointer(nullptr); + + static_assert( + std::is_trivially_destructible>::value, + "Expected no work in destructor for TrieNode"); +} + +TrieSubtrie *TrieSubtrie::sink( + size_t I, TrieContent &Content, size_t NumSubtrieBits, size_t NewI, + function_ref)> Saver) { + assert(NumSubtrieBits > 0); + std::unique_ptr S = create(StartBit + NumBits, NumSubtrieBits); + + assert(NewI < S->Slots.size()); + S->Slots[NewI].store(&Content); + + TrieNode *ExistingNode = &Content; + assert(I < Slots.size()); + if (Slots[I].compare_exchange_strong(ExistingNode, S.get())) + return Saver(std::move(S)); + + // Another thread created a subtrie already. Return it and let "S" be + // destructed. 
+ return cast(ExistingNode); +} + +struct ThreadSafeTrieRawHashMapBase::ImplType { + static std::unique_ptr create(size_t StartBit, size_t NumBits) { + size_t Size = sizeof(ImplType) + getTrieTailSize(StartBit, NumBits); + void *Memory = ::malloc(Size); + ImplType *Impl = ::new (Memory) ImplType(StartBit, NumBits); + return std::unique_ptr(Impl); + } + + TrieSubtrie *save(std::unique_ptr S) { + assert(!S->Next && "Expected S to a freshly-constructed leaf"); + + TrieSubtrie *CurrentHead = nullptr; + // Add ownership of "S" to front of the list, so that Root -> S -> + // Root.Next. This works by repeatedly setting S->Next to a candidate value + // of Root.Next (initially nullptr), then setting Root.Next to S once the + // candidate matches reality. + while (!Root.Next.compare_exchange_weak(CurrentHead, S.get())) + S->Next.exchange(CurrentHead); + + // Ownership transferred to subtrie. + return S.release(); + } + + static void *operator new(size_t Size) { return ::malloc(Size); } + void operator delete(void *Ptr) { ::free(Ptr); } + + /// FIXME: This should take a function that allocates and constructs the + /// content lazily (taking the hash as a separate parameter), in case of + /// collision. + ThreadSafeAllocator ContentAlloc; + TrieSubtrie Root; // Must be last! Tail-allocated. + +private: + ImplType(size_t StartBit, size_t NumBits) : Root(StartBit, NumBits) {} +}; + +ThreadSafeTrieRawHashMapBase::ImplType & +ThreadSafeTrieRawHashMapBase::getOrCreateImpl() { + if (ImplType *Impl = ImplPtr.load()) + return *Impl; + + // Create a new ImplType and store it if another thread doesn't do so first. + // If another thread wins this one is destroyed locally. 
+ std::unique_ptr Impl = ImplType::create(0, NumRootBits); + ImplType *ExistingImpl = nullptr; + if (ImplPtr.compare_exchange_strong(ExistingImpl, Impl.get())) + return *Impl.release(); + + return *ExistingImpl; +} + +ThreadSafeTrieRawHashMapBase::PointerBase +ThreadSafeTrieRawHashMapBase::find(ArrayRef Hash) const { + assert(!Hash.empty() && "Uninitialized hash"); + + ImplType *Impl = ImplPtr.load(); + if (!Impl) + return PointerBase(); + + TrieSubtrie *S = &Impl->Root; + IndexGenerator IndexGen{NumRootBits, NumSubtrieBits, Hash}; + size_t Index = IndexGen.next(); + while (true) { + // Try to set the content. + TrieNode *Existing = S->get(Index); + if (!Existing) + return PointerBase(S, Index, *IndexGen.StartBit); + + // Check for an exact match. + if (auto *ExistingContent = dyn_cast(Existing)) + return ExistingContent->getHash() == Hash + ? PointerBase(ExistingContent->getValuePointer()) + : PointerBase(S, Index, *IndexGen.StartBit); + + Index = IndexGen.next(); + S = cast(Existing); + } +} + +ThreadSafeTrieRawHashMapBase::PointerBase ThreadSafeTrieRawHashMapBase::insert( + PointerBase Hint, ArrayRef Hash, + function_ref Hash)> + Constructor) { + assert(!Hash.empty() && "Uninitialized hash"); + + ImplType &Impl = getOrCreateImpl(); + TrieSubtrie *S = &Impl.Root; + IndexGenerator IndexGen{NumRootBits, NumSubtrieBits, Hash}; + size_t Index; + if (Hint.isHint()) { + S = static_cast(Hint.P); + Index = IndexGen.hint(Hint.I, Hint.B); + } else { + Index = IndexGen.next(); + } + + while (true) { + // Load the node from the slot, allocating and calling the constructor if + // the slot is empty. + bool Generated = false; + TrieNode &Existing = S->Slots[Index].loadOrGenerate([&]() { + Generated = true; + + // Construct the value itself at the tail. 
+ uint8_t *Memory = reinterpret_cast( + Impl.ContentAlloc.Allocate(ContentAllocSize, ContentAllocAlign)); + const uint8_t *HashStorage = Constructor(Memory + ContentOffset, Hash); + + // Construct the TrieContent header, passing in the offset to the hash. + TrieContent *Content = ::new (Memory) + TrieContent(ContentOffset, Hash.size(), HashStorage - Memory); + assert(Hash == Content->getHash() && "Hash not properly initialized"); + return Content; + }); + // If we just generated it, return it! + if (Generated) + return PointerBase(cast(Existing).getValuePointer()); + + if (auto *ST = dyn_cast(&Existing)) { + S = ST; + Index = IndexGen.next(); + continue; + } + + // Return the existing content if it's an exact match! + auto &ExistingContent = cast(Existing); + if (ExistingContent.getHash() == Hash) + return PointerBase(ExistingContent.getValuePointer()); + + // Sink the existing content as long as the indexes match. + while (true) { + size_t NextIndex = IndexGen.next(); + size_t NewIndexForExistingContent = + IndexGen.getCollidingBits(ExistingContent.getHash()); + S = S->sink(Index, ExistingContent, IndexGen.getNumBits(), + NewIndexForExistingContent, + [&Impl](std::unique_ptr S) { + return Impl.save(std::move(S)); + }); + Index = NextIndex; + + // Found the difference. + if (NextIndex != NewIndexForExistingContent) + break; + } + } +} + +ThreadSafeTrieRawHashMapBase::ThreadSafeTrieRawHashMapBase( + size_t ContentAllocSize, size_t ContentAllocAlign, size_t ContentOffset, + std::optional NumRootBits, std::optional NumSubtrieBits) + : ContentAllocSize(ContentAllocSize), ContentAllocAlign(ContentAllocAlign), + ContentOffset(ContentOffset), + NumRootBits(NumRootBits ? *NumRootBits : DefaultNumRootBits), + NumSubtrieBits(NumSubtrieBits ? 
*NumSubtrieBits : DefaultNumSubtrieBits), + ImplPtr(nullptr) { + assert((!NumRootBits || *NumRootBits < 20) && + "Root should have fewer than ~1M slots"); + assert((!NumSubtrieBits || *NumSubtrieBits < 10) && + "Subtries should have fewer than ~1K slots"); +} + +ThreadSafeTrieRawHashMapBase::ThreadSafeTrieRawHashMapBase( + ThreadSafeTrieRawHashMapBase &&RHS) + : ContentAllocSize(RHS.ContentAllocSize), + ContentAllocAlign(RHS.ContentAllocAlign), + ContentOffset(RHS.ContentOffset), NumRootBits(RHS.NumRootBits), + NumSubtrieBits(RHS.NumSubtrieBits) { + // Steal the root from RHS. + ImplPtr = RHS.ImplPtr.exchange(nullptr); +} + +ThreadSafeTrieRawHashMapBase::~ThreadSafeTrieRawHashMapBase() { + assert(!ImplPtr.load() && "Expected subclass to call destroyImpl()"); +} + +void ThreadSafeTrieRawHashMapBase::destroyImpl( + function_ref Destructor) { + std::unique_ptr Impl(ImplPtr.exchange(nullptr)); + if (!Impl) + return; + + // Destroy content nodes throughout trie. Avoid destroying any subtries since + // we need TrieNode::classof() to find the content nodes. + // + // FIXME: Once we have bitsets (see FIXME in TrieSubtrie class), use them + // facilitate sparse iteration here. + if (Destructor) + for (TrieSubtrie *Trie = &Impl->Root; Trie; Trie = Trie->Next.load()) + for (auto &Slot : Trie->Slots) + if (auto *Content = dyn_cast_or_null(Slot.load())) + Destructor(Content->getValuePointer()); + + // Destroy the subtries. Incidentally, this destroys them in the reverse order + // of saving. 
+  TrieSubtrie *Trie = Impl->Root.Next;
+  while (Trie) {
+    TrieSubtrie *Next = Trie->Next.exchange(nullptr);
+    delete Trie;
+    Trie = Next;
+  }
+}
+
+/// Return a pointer to the root subtrie, or a default-constructed PointerBase
+/// when no ImplType has been created yet.
+ThreadSafeTrieRawHashMapBase::PointerBase
+ThreadSafeTrieRawHashMapBase::getRoot() const {
+  ImplType *Impl = ImplPtr.load();
+  if (!Impl)
+    return PointerBase();
+  return PointerBase(&Impl->Root);
+}
+
+/// Return the StartBit recorded in the subtrie \p P points at. Returns 0 when
+/// \p P is null or does not point at a subtrie.
+unsigned ThreadSafeTrieRawHashMapBase::getStartBit(
+    ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+  assert(!P.isHint() && "Not a valid trie");
+  if (!P.P)
+    return 0;
+  if (auto *S = dyn_cast<TrieSubtrie>((TrieNode *)P.P))
+    return S->StartBit;
+  return 0;
+}
+
+/// Return the NumBits recorded in the subtrie \p P points at. Returns 0 when
+/// \p P is null or does not point at a subtrie.
+unsigned ThreadSafeTrieRawHashMapBase::getNumBits(
+    ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+  assert(!P.isHint() && "Not a valid trie");
+  if (!P.P)
+    return 0;
+  if (auto *S = dyn_cast<TrieSubtrie>((TrieNode *)P.P))
+    return S->NumBits;
+  return 0;
+}
+
+/// Count the non-null slots of the subtrie \p P points at. Returns 0 when
+/// \p P is null or does not point at a subtrie.
+unsigned ThreadSafeTrieRawHashMapBase::getNumSlotUsed(
+    ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+  assert(!P.isHint() && "Not a valid trie");
+  if (!P.P)
+    return 0;
+  auto *S = dyn_cast<TrieSubtrie>((TrieNode *)P.P);
+  if (!S)
+    return 0;
+  unsigned Num = 0;
+  // Only the null-ness of each slot matters; do not bind the loaded pointer to
+  // a name (the previous `auto *E` shadowed the loop bound and was unused).
+  for (unsigned I = 0, E = S->Slots.size(); I < E; ++I)
+    if (S->Slots[I].load())
+      ++Num;
+  return Num;
+}
+
+/// Render the hash prefix of the subtrie \p P points at.
+///
+/// The prefix is taken from the hash of the first TrieContent reachable by
+/// following the first used slot at each level. Leading whole bytes are
+/// printed as lowercase hex and the remaining bits, up to the subtrie's
+/// StartBit, are printed as '0'/'1' inside square brackets.
+///
+/// \returns the empty string when \p P is null or is not a subtrie.
+std::string ThreadSafeTrieRawHashMapBase::getTriePrefixAsString(
+    ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+  assert(!P.isHint() && "Not a valid trie");
+  if (!P.P)
+    return "";
+
+  auto *S = dyn_cast<TrieSubtrie>((TrieNode *)P.P);
+  if (!S || !S->IsSubtrie)
+    return "";
+
+  // Find a TrieContent node which has hash stored. Depth search following the
+  // first used slot until a TrieContent node is found.
+  TrieSubtrie *Current = S;
+  TrieContent *Node = nullptr;
+  while (Current) {
+    TrieSubtrie *Next = nullptr;
+    // Find the first used slot in the trie.
+    for (unsigned I = 0, E = Current->Slots.size(); I < E; ++I) {
+      auto *Child = Current->get(I);
+      if (!Child)
+        continue;
+
+      if (auto *Content = dyn_cast<TrieContent>(Child))
+        Node = Content;
+      else if (auto *Sub = dyn_cast<TrieSubtrie>(Child))
+        Next = Sub;
+      break;
+    }
+
+    // Found the node.
+    if (Node)
+      break;
+
+    // Continue to the next level if the node is not found.
+    Current = Next;
+  }
+
+  assert(Node && "malformed trie, cannot find TrieContent on leaf node");
+  // The prefix for the current trie is the first `StartBit` of the content
+  // stored underneath this subtrie.
+  std::string Str;
+  raw_string_ostream SS(Str);
+
+  // (StartBit + 1) / 8 is 0 whenever StartBit < 7 (e.g. the root, at bit 0).
+  // Subtracting 1 from that in unsigned arithmetic would wrap around, which
+  // corrupts both the hex prefix length and the start of the bit loop below,
+  // so clamp to zero full bytes. Results for StartBit >= 7 are unchanged.
+  const unsigned RoundedBytes = (S->StartBit + 1) / 8;
+  const unsigned StartFullBytes = RoundedBytes ? RoundedBytes - 1 : 0;
+  SS << toHex(toStringRef(Node->getHash()).take_front(StartFullBytes),
+              /*LowerCase=*/true);
+
+  // For the part of the prefix that doesn't fill a byte, print raw bit values.
+  std::string Bits;
+  for (unsigned I = StartFullBytes * 8, E = S->StartBit; I < E; ++I) {
+    unsigned Index = I / 8;
+    unsigned Offset = 7 - I % 8;
+    Bits.push_back('0' + ((Node->getHash()[Index] >> Offset) & 1));
+  }
+
+  if (!Bits.empty())
+    SS << "[" << Bits << "]";
+
+  return SS.str();
+}
+
+/// Count the subtries on the ownership list starting at the root. Returns 0
+/// when no ImplType has been created yet.
+unsigned ThreadSafeTrieRawHashMapBase::getNumTries() const {
+  ImplType *Impl = ImplPtr.load();
+  if (!Impl)
+    return 0;
+  unsigned Num = 0;
+  for (TrieSubtrie *Trie = &Impl->Root; Trie; Trie = Trie->Next.load())
+    ++Num;
+  return Num;
+}
+
+/// Return the subtrie that follows \p P on the ownership list, or a
+/// default-constructed PointerBase when \p P is null, is not a subtrie, or is
+/// the last entry.
+ThreadSafeTrieRawHashMapBase::PointerBase
+ThreadSafeTrieRawHashMapBase::getNextTrie(
+    ThreadSafeTrieRawHashMapBase::PointerBase P) const {
+  assert(!P.isHint() && "Not a valid trie");
+  if (!P.P)
+    return PointerBase();
+  auto *S = dyn_cast<TrieSubtrie>((TrieNode *)P.P);
+  if (!S)
+    return PointerBase();
+  if (auto *E = S->Next.load())
+    return PointerBase(E);
+  return PointerBase();
+}
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index 44097bad7b46ed..9f6f15bbd05f2d 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -1223,13 +1223,14 @@ Expected
readNativeFileSlice(file_t FD, MutableArrayRef Buf, return NumRead; } -std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) { +std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout, + bool Exclusive) { auto Start = std::chrono::steady_clock::now(); auto End = Start + Timeout; do { struct flock Lock; memset(&Lock, 0, sizeof(Lock)); - Lock.l_type = F_WRLCK; + Lock.l_type = Exclusive ? F_WRLCK : F_RDLCK; Lock.l_whence = SEEK_SET; Lock.l_start = 0; Lock.l_len = 0; @@ -1238,15 +1239,17 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) { int Error = errno; if (Error != EACCES && Error != EAGAIN) return std::error_code(Error, std::generic_category()); + if (Timeout.count() == 0) + break; usleep(1000); } while (std::chrono::steady_clock::now() < End); return make_error_code(errc::no_lock_available); } -std::error_code lockFile(int FD) { +std::error_code lockFile(int FD, bool Exclusive) { struct flock Lock; memset(&Lock, 0, sizeof(Lock)); - Lock.l_type = F_WRLCK; + Lock.l_type = Exclusive ? F_WRLCK : F_RDLCK; Lock.l_whence = SEEK_SET; Lock.l_start = 0; Lock.l_len = 0; diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index c4bd5e24723517..07ee3d96be5ec6 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -1327,8 +1327,10 @@ Expected readNativeFileSlice(file_t FileHandle, return readNativeFileImpl(FileHandle, Buf, &Overlapped); } -std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) { - DWORD Flags = LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY; +std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout, + bool Exclusive) { + DWORD Flags = Exclusive ? 
LOCKFILE_EXCLUSIVE_LOCK : 0; + Flags |= LOCKFILE_FAIL_IMMEDIATELY; OVERLAPPED OV = {}; file_t File = convertFDToNativeFile(FD); auto Start = std::chrono::steady_clock::now(); @@ -1338,6 +1340,8 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) { return std::error_code(); DWORD Error = ::GetLastError(); if (Error == ERROR_LOCK_VIOLATION) { + if (Timeout.count() == 0) + break; ::Sleep(1); continue; } @@ -1346,8 +1350,8 @@ std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) { return mapWindowsError(ERROR_LOCK_VIOLATION); } -std::error_code lockFile(int FD) { - DWORD Flags = LOCKFILE_EXCLUSIVE_LOCK; +std::error_code lockFile(int FD, bool Exclusive) { + DWORD Flags = Exclusive ? LOCKFILE_EXCLUSIVE_LOCK : 0; OVERLAPPED OV = {}; file_t File = convertFDToNativeFile(FD); if (::LockFileEx(File, Flags, 0, MAXDWORD, MAXDWORD, &OV)) diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index c66075434f1583..44032bec48ec12 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -23,6 +23,7 @@ llvm_canonicalize_cmake_booleans( LLVM_ENABLE_EXPENSIVE_CHECKS LLVM_ENABLE_REVERSE_ITERATION LLVM_INCLUDE_DXIL_TESTS + LLVM_ENABLE_ONDISK_CAS LLVM_TOOL_LLVM_DRIVER_BUILD LLVM_INCLUDE_SPIRV_SIMULATOR_TESTS LLVM_INCLUDE_SPIRV_TOOLS_TESTS @@ -59,6 +60,7 @@ configure_lit_site_cfg( # NOTE: Sync the substitutions in test/lit.cfg when adding to this list. 
set(LLVM_TEST_DEPENDS BugpointPasses + CASPluginTest FileCheck LLVMWindowsDriver UnitTests @@ -73,6 +75,7 @@ set(LLVM_TEST_DEPENDS llvm-bcanalyzer llvm-bitcode-strip llvm-c-test + llvm-cas llvm-cat llvm-cfi-verify llvm-cgdata diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 5a03a85386e0aa..4ca73a3c02dc73 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -615,9 +615,17 @@ def have_ld64_plugin_support(): if config.expensive_checks: config.available_features.add("expensive_checks") +if config.have_ondisk_cas: + config.available_features.add("ondisk_cas") + if "MemoryWithOrigins" in config.llvm_use_sanitizer: config.available_features.add("use_msan_with_origins") +# Restrict the size of the on-disk CAS for tests. This allows testing in +# constrained environments (e.g. small TMPDIR). It also prevents leaving +# behind large files on file systems that do not support sparse files if a test +# crashes before resizing the file. +config.environment["LLVM_CAS_MAX_MAPPING_SIZE"] = "%d" % (100 * 1024 * 1024) # Some tools support an environment variable "OBJECT_MODE" on AIX OS, which # controls the kind of objects they will support. 
If there is no "OBJECT_MODE" diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index 0968f6214772d0..3cf951be5f71e0 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -59,6 +59,7 @@ config.llvm_raevict_model_autogenerated = @LLVM_RAEVICT_MODEL_AUTOGENERATED@ config.expensive_checks = @LLVM_ENABLE_EXPENSIVE_CHECKS@ config.reverse_iteration = @LLVM_ENABLE_REVERSE_ITERATION@ config.dxil_tests = @LLVM_INCLUDE_DXIL_TESTS@ +config.have_ondisk_cas = @LLVM_ENABLE_ONDISK_CAS@ config.have_llvm_driver = @LLVM_TOOL_LLVM_DRIVER_BUILD@ config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@ config.have_vc_rev = @LLVM_APPEND_VC_REV@ diff --git a/llvm/test/tools/llvm-cas/Inputs/oneline b/llvm/test/tools/llvm-cas/Inputs/oneline new file mode 100644 index 00000000000000..d95f3ad14dee63 --- /dev/null +++ b/llvm/test/tools/llvm-cas/Inputs/oneline @@ -0,0 +1 @@ +content diff --git a/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline b/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline new file mode 100644 index 00000000000000..6b584e8ece562e --- /dev/null +++ b/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline @@ -0,0 +1 @@ +content \ No newline at end of file diff --git a/llvm/test/tools/llvm-cas/cache.test b/llvm/test/tools/llvm-cas/cache.test new file mode 100644 index 00000000000000..f0ce69190d4182 --- /dev/null +++ b/llvm/test/tools/llvm-cas/cache.test @@ -0,0 +1,14 @@ +RUN: rm -rf %t %t.cas +RUN: mkdir %t + +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data /dev/null > %t/empty.casid +RUN: echo "abc" | \ +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data - >%t/abc.casid + +RUN: llvm-cas --cas %t/cas --put-cache-key @%t/abc.casid @%t/empty.casid +RUN: llvm-cas --cas %t/cas --get-cache-result @%t/abc.casid > %t/empty2.casid +RUN: diff %t/empty.casid %t/empty2.casid + +RUN: not llvm-cas --cas %t/cas --get-cache-result @%t/empty.casid diff --git a/llvm/test/tools/llvm-cas/lit.local.cfg 
b/llvm/test/tools/llvm-cas/lit.local.cfg new file mode 100644 index 00000000000000..379945b68925df --- /dev/null +++ b/llvm/test/tools/llvm-cas/lit.local.cfg @@ -0,0 +1,2 @@ +if not config.have_ondisk_cas: + config.unsupported = True diff --git a/llvm/test/tools/llvm-cas/make-blob.test b/llvm/test/tools/llvm-cas/make-blob.test new file mode 100644 index 00000000000000..10c64732ceb901 --- /dev/null +++ b/llvm/test/tools/llvm-cas/make-blob.test @@ -0,0 +1,42 @@ +RUN: rm -rf %t %t.cas +RUN: mkdir %t + +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data - </dev/null >%t/empty.casid +RUN: sed -e 's,^.,CHECK: ,' <%t/empty.casid >%t/empty.check +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data /dev/null | FileCheck %t/empty.check +RUN: echo "abc" | \ +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data - >%t/abc.casid +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data %S/Inputs/oneline >%t/oneline.casid +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data %S/Inputs/oneline-nonewline >%t/oneline-nonewline.casid + +RUN: llvm-cas --cas %t.cas --cat-blob @%t/empty.casid |\ +RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty +RUN: llvm-cas --cas %t.cas --print-kind @%t/empty.casid |\ +RUN: FileCheck %s -check-prefix CHECK-KIND +CHECK-EMPTY-NOT: {{.}} +CHECK-KIND: object + +RUN: llvm-cas --cas %t.cas --cat-blob @%t/abc.casid |\ +RUN: FileCheck %s -check-prefix CHECK-ABC +RUN: llvm-cas --cas %t.cas --print-kind @%t/abc.casid |\ +RUN: FileCheck %s -check-prefix CHECK-KIND +CHECK-ABC: abc + +RUN: llvm-cas --cas %t.cas --cat-blob @%t/oneline-nonewline.casid |\ +RUN: FileCheck %s -check-prefix CHECK-ONELINE +RUN: llvm-cas --cas %t.cas --cat-blob @%t/oneline.casid |\ +RUN: FileCheck %s -check-prefix CHECK-ONELINE +CHECK-ONELINE: content + +# Double-check newlines.
+RUN: llvm-cas --cas %t.cas --cat-blob @%t/oneline-nonewline.casid \ +RUN: >%t/oneline-nonewline +RUN: diff %S/Inputs/oneline-nonewline %t/oneline-nonewline +RUN: llvm-cas --cas %t.cas --cat-blob @%t/oneline.casid \ +RUN: >%t/oneline +RUN: diff %S/Inputs/oneline %t/oneline diff --git a/llvm/test/tools/llvm-cas/make-node.test b/llvm/test/tools/llvm-cas/make-node.test new file mode 100644 index 00000000000000..876afd89c69621 --- /dev/null +++ b/llvm/test/tools/llvm-cas/make-node.test @@ -0,0 +1,40 @@ +RUN: rm -rf %t +RUN: mkdir %t + +# Make some empty objects. +RUN: llvm-cas --cas %t/cas --make-node \ +RUN: --data - </dev/null >%t/empty.casid + +RUN: llvm-cas --cas %t/cas --cat-node-data @%t/empty.casid |\ +RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty +RUN: llvm-cas --cas %t/cas --ls-node-refs @%t/empty.casid |\ +RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty +CHECK-EMPTY-NOT: {{.}} + +RUN: llvm-cas --cas %t/cas --print-kind @%t/empty.casid |\ +RUN: FileCheck %s -check-prefix CHECK-NO-KIND +### FIXME: Node ObjectKind with no reference is Blob kind in BuiltinCAS. +CHECK-NO-KIND: object + +# Make a complex object, which references existing ones. Reference a blob and +# other objects, and reference one of them twice to be sure they don't get +# deduped.
+RUN: llvm-cas --cas %t/cas --make-blob --data /dev/null \ +RUN: >%t/empty-blob.casid +RUN: cat %t/empty.casid %t/empty.casid %t/empty-blob.casid \ +RUN: >%t/complex.refs +RUN: cat %t/complex.refs | sed -e 's,^.,CHECK: ,' > %t/complex.check +RUN: llvm-cas --cas %t/cas --make-node \ +RUN: --data %S/Inputs/oneline @%t/complex.refs \ +RUN: >%t/complex.casid +RUN: llvm-cas --cas %t/cas --print-kind \ +RUN: @%t/complex.casid | FileCheck %s -check-prefix COMPLEX-KIND +RUN: llvm-cas --cas %t/cas --cat-node-data \ +RUN: @%t/complex.casid | FileCheck %s -check-prefix COMPLEX-DATA +RUN: llvm-cas --cas %t/cas --ls-node-refs @%t/complex.casid |\ +RUN: FileCheck %t/complex.check +COMPLEX-KIND: object +COMPLEX-DATA: content + +RUN: not llvm-cas --cas %t/cas --ls-tree @%t/complex.casid 2>&1 | FileCheck %s --check-prefix=CHECK-WRONG-TYPE +CHECK-WRONG-TYPE: llvm-cas: ls-tree: not a tree object diff --git a/llvm/test/tools/llvm-cas/print-id.test b/llvm/test/tools/llvm-cas/print-id.test new file mode 100644 index 00000000000000..d0d1be498524e1 --- /dev/null +++ b/llvm/test/tools/llvm-cas/print-id.test @@ -0,0 +1,13 @@ +RUN: rm -rf %t +RUN: mkdir %t + +RUN: llvm-cas --cas %t/cas --make-blob --data %s > %t/id + +# Confirm that the ID has the right prefix, is well-formed, and that there's +# nothing else on the line. +RUN: FileCheck %s --match-full-lines --strict-whitespace <%t/id +CHECK:llvmcas://{{[a-z0-9]+}} + +# Confirm that there's a newline after. 
+RUN: wc -l <%t/id | FileCheck %s -check-prefix=NEWLINE +NEWLINE: 1 diff --git a/llvm/tools/libCASPluginTest/CMakeLists.txt b/llvm/tools/libCASPluginTest/CMakeLists.txt new file mode 100644 index 00000000000000..aaeba14fb40fb4 --- /dev/null +++ b/llvm/tools/libCASPluginTest/CMakeLists.txt @@ -0,0 +1,12 @@ +set(LLVM_LINK_COMPONENTS + CAS + Support + ) + +set(SOURCES + libCASPluginTest.cpp + ) + +set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/libCASPluginTest.exports) + +add_llvm_library(CASPluginTest SHARED ${SOURCES}) diff --git a/llvm/tools/libCASPluginTest/libCASPluginTest.cpp b/llvm/tools/libCASPluginTest/libCASPluginTest.cpp new file mode 100644 index 00000000000000..0d9443da9d39f1 --- /dev/null +++ b/llvm/tools/libCASPluginTest/libCASPluginTest.cpp @@ -0,0 +1,572 @@ +//===- llvm/tools/libCASPluginTest/libCASPluginTest.cpp ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of the LLVM CAS plugin API, for testing purposes. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm-c/CAS/PluginAPI_functions.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Support/CBindingWrapping.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/ThreadPool.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; +using namespace llvm::cas::ondisk; + +static char *copyNewMallocString(StringRef Str) { + char *c_str = (char *)malloc(Str.size() + 1); + std::uninitialized_copy(Str.begin(), Str.end(), c_str); + c_str[Str.size()] = '\0'; + return c_str; +} + +template +static ResT reportError(Error &&E, char **error, ResT Result = ResT()) { + if (error) + *error = copyNewMallocString(toString(std::move(E))); + return Result; +} + +void llcas_get_plugin_version(unsigned *major, unsigned *minor) { + *major = LLCAS_VERSION_MAJOR; + *minor = LLCAS_VERSION_MINOR; +} + +void llcas_string_dispose(char *str) { free(str); } + +namespace { + +struct CASPluginOptions { + std::string OnDiskPath; + std::string UpstreamPath; + std::string FirstPrefix; + std::string SecondPrefix; + bool SimulateMissingObjects = false; + bool Logging = true; + + Error setOption(StringRef Name, StringRef Value); +}; + +DEFINE_SIMPLE_CONVERSION_FUNCTIONS(CASPluginOptions, llcas_cas_options_t) + +} // namespace + +Error CASPluginOptions::setOption(StringRef Name, StringRef Value) { + if (Name == "first-prefix") + FirstPrefix = Value; + else if (Name == "second-prefix") + SecondPrefix = Value; + else if (Name == "upstream-path") + UpstreamPath = Value; + else if (Name == "simulate-missing-objects") + SimulateMissingObjects = true; + else if (Name == "no-logging") + Logging = false; + else + return createStringError(errc::invalid_argument, + Twine("unknown option: ") + Name); + return Error::success(); +} + +llcas_cas_options_t llcas_cas_options_create(void) { + 
return wrap(new CASPluginOptions()); +} + +void llcas_cas_options_dispose(llcas_cas_options_t c_opts) { + delete unwrap(c_opts); +} + +void llcas_cas_options_set_ondisk_path(llcas_cas_options_t c_opts, + const char *path) { + auto &Opts = *unwrap(c_opts); + Opts.OnDiskPath = path; +} + +bool llcas_cas_options_set_option(llcas_cas_options_t c_opts, const char *name, + const char *value, char **error) { + auto &Opts = *unwrap(c_opts); + if (Error E = Opts.setOption(name, value)) + return reportError(std::move(E), error, true); + return false; +} + +namespace { + +struct CASWrapper { + std::string FirstPrefix; + std::string SecondPrefix; + /// If true, asynchronous "download" of an object will treat it as missing. + bool SimulateMissingObjects = false; + bool Logging = true; + std::unique_ptr DB; + /// Used for testing the \c globally parameter of action cache APIs. Simulates + /// "uploading"/"downloading" objects from/to the primary on-disk path. + std::unique_ptr UpstreamDB; + StdThreadPool Pool{llvm::hardware_concurrency()}; + + std::mutex Lock{}; + + /// Check if the object is contained, in the "local" CAS only or "globally". + bool containsObject(ObjectID ID, bool Globally); + + /// Load the object, potentially "downloading" it from upstream. + Expected> loadObject(ObjectID ID); + + /// "Uploads" a key and the associated full node graph. + Error upstreamKey(ArrayRef Key, ObjectID Value); + + /// "Downloads" the ID associated with the key but not the node data. The node + /// itself and the rest of the nodes in the graph will be "downloaded" lazily + /// as they are visited. + Expected> downstreamKey(ArrayRef Key); + + /// Synchronized access to \c llvm::errs(). + void syncErrs(llvm::function_ref Fn) { + if (!Logging) { + // Ignore log output. + SmallString<32> Buf; + raw_svector_ostream OS(Buf); + Fn(OS); + return; + } + std::unique_lock LockGuard(Lock); + Fn(errs()); + errs().flush(); + } + +private: + /// "Uploads" the full object node graph. 
+ Expected upstreamNode(ObjectID Node); + /// "Downloads" only a single object node. The rest of the nodes in the graph + /// will be "downloaded" lazily as they are visited. + Expected downstreamNode(ObjectID Node); +}; + +DEFINE_SIMPLE_CONVERSION_FUNCTIONS(CASWrapper, llcas_cas_t) + +} // namespace + +bool CASWrapper::containsObject(ObjectID ID, bool Globally) { + if (DB->getGraphDB().containsObject(ID)) + return true; + if (!Globally || !UpstreamDB) + return false; + + ObjectID UpstreamID = + UpstreamDB->getGraphDB().getReference(DB->getGraphDB().getDigest(ID)); + return UpstreamDB->getGraphDB().containsObject(UpstreamID); +} + +Expected> +CASWrapper::loadObject(ObjectID ID) { + std::optional Obj; + if (Error E = DB->getGraphDB().load(ID).moveInto(Obj)) + return std::move(E); + if (Obj) + return Obj; + if (!UpstreamDB) + return std::nullopt; + + // Try "downloading" the node from upstream. + ObjectID UpstreamID = + UpstreamDB->getGraphDB().getReference(DB->getGraphDB().getDigest(ID)); + std::optional Ret; + if (Error E = downstreamNode(UpstreamID).moveInto(Ret)) + return std::move(E); + return DB->getGraphDB().load(ID); +} + +/// Imports a single object node. 
+static Expected importNode(ObjectID FromID, OnDiskGraphDB &FromDB, + OnDiskGraphDB &ToDB) { + ObjectID ToID = ToDB.getReference(FromDB.getDigest(FromID)); + if (ToDB.containsObject(ToID)) + return ToID; + + std::optional FromH; + if (Error E = FromDB.load(FromID).moveInto(FromH)) + return std::move(E); + if (!FromH) + return ToID; + + auto Data = FromDB.getObjectData(*FromH); + auto FromRefs = FromDB.getObjectRefs(*FromH); + SmallVector Refs; + for (ObjectID FromRef : FromRefs) + Refs.push_back(ToDB.getReference(FromDB.getDigest(FromRef))); + + if (Error E = ToDB.store(ToID, Refs, Data)) + return std::move(E); + return ToID; +} + +Expected CASWrapper::upstreamNode(ObjectID Node) { + OnDiskGraphDB &FromDB = DB->getGraphDB(); + OnDiskGraphDB &ToDB = UpstreamDB->getGraphDB(); + + std::optional FromH; + if (Error E = FromDB.load(Node).moveInto(FromH)) + return std::move(E); + if (!FromH) + return createStringError(errc::invalid_argument, "node doesn't exist"); + + for (ObjectID Ref : FromDB.getObjectRefs(*FromH)) { + std::optional ID; + if (Error E = upstreamNode(Ref).moveInto(ID)) + return std::move(E); + } + + return importNode(Node, FromDB, ToDB); +} + +Expected CASWrapper::downstreamNode(ObjectID Node) { + OnDiskGraphDB &FromDB = UpstreamDB->getGraphDB(); + OnDiskGraphDB &ToDB = DB->getGraphDB(); + return importNode(Node, FromDB, ToDB); +} + +Error CASWrapper::upstreamKey(ArrayRef Key, ObjectID Value) { + if (!UpstreamDB) + return Error::success(); + Expected UpstreamVal = upstreamNode(Value); + if (!UpstreamVal) + return UpstreamVal.takeError(); + Expected PutValue = UpstreamDB->KVPut(Key, *UpstreamVal); + if (!PutValue) + return PutValue.takeError(); + assert(*PutValue == *UpstreamVal); + return Error::success(); +} + +Expected> +CASWrapper::downstreamKey(ArrayRef Key) { + if (!UpstreamDB) + return std::nullopt; + std::optional UpstreamValue; + if (Error E = UpstreamDB->KVGet(Key).moveInto(UpstreamValue)) + return std::move(E); + if (!UpstreamValue) + return 
std::nullopt; + + ObjectID Value = DB->getGraphDB().getReference( + UpstreamDB->getGraphDB().getDigest(*UpstreamValue)); + Expected PutValue = DB->KVPut(Key, Value); + if (!PutValue) + return PutValue.takeError(); + assert(*PutValue == Value); + return PutValue; +} + +llcas_cas_t llcas_cas_create(llcas_cas_options_t c_opts, char **error) { + auto &Opts = *unwrap(c_opts); + Expected> DB = UnifiedOnDiskCache::open( + Opts.OnDiskPath, /*SizeLimit=*/std::nullopt, + BuiltinCASContext::getHashName(), sizeof(HashType)); + if (!DB) + return reportError(DB.takeError(), error); + + std::unique_ptr UpstreamDB; + if (!Opts.UpstreamPath.empty()) { + if (Error E = UnifiedOnDiskCache::open( + Opts.UpstreamPath, /*SizeLimit=*/std::nullopt, + BuiltinCASContext::getHashName(), sizeof(HashType)) + .moveInto(UpstreamDB)) + return reportError(std::move(E), error); + } + + return wrap(new CASWrapper{Opts.FirstPrefix, Opts.SecondPrefix, + Opts.SimulateMissingObjects, Opts.Logging, + std::move(*DB), std::move(UpstreamDB)}); +} + +void llcas_cas_dispose(llcas_cas_t c_cas) { delete unwrap(c_cas); } + +void llcas_cas_options_set_client_version(llcas_cas_options_t, unsigned major, + unsigned minor) { + // Ignore for now. +} + +char *llcas_cas_get_hash_schema_name(llcas_cas_t) { + // Using same name as builtin CAS so that it's interchangeable for testing + // purposes. 
+ return copyNewMallocString("llvm.cas.builtin.v2[BLAKE3]"); +} + +unsigned llcas_digest_parse(llcas_cas_t c_cas, const char *printed_digest, + uint8_t *bytes, size_t bytes_size, char **error) { + auto &Wrapper = *unwrap(c_cas); + if (bytes_size < sizeof(HashType)) + return sizeof(HashType); + + StringRef PrintedDigest = printed_digest; + bool Consumed = PrintedDigest.consume_front(Wrapper.FirstPrefix); + assert(Consumed); + (void)Consumed; + Consumed = PrintedDigest.consume_front(Wrapper.SecondPrefix); + assert(Consumed); + (void)Consumed; + + Expected Digest = BuiltinCASContext::parseID(PrintedDigest); + if (!Digest) + return reportError(Digest.takeError(), error, 0); + std::uninitialized_copy(Digest->begin(), Digest->end(), bytes); + return Digest->size(); +} + +bool llcas_digest_print(llcas_cas_t c_cas, llcas_digest_t c_digest, + char **printed_id, char **error) { + auto &Wrapper = *unwrap(c_cas); + SmallString<74> PrintDigest; + raw_svector_ostream OS(PrintDigest); + // Include these for testing purposes. 
+ OS << Wrapper.FirstPrefix << Wrapper.SecondPrefix; + BuiltinCASContext::printID(ArrayRef(c_digest.data, c_digest.size), OS); + *printed_id = copyNewMallocString(PrintDigest); + return false; +} + +bool llcas_cas_get_objectid(llcas_cas_t c_cas, llcas_digest_t c_digest, + llcas_objectid_t *c_id_p, char **error) { + auto &CAS = unwrap(c_cas)->DB->getGraphDB(); + ObjectID ID = CAS.getReference(ArrayRef(c_digest.data, c_digest.size)); + *c_id_p = llcas_objectid_t{ID.getOpaqueData()}; + return false; +} + +llcas_digest_t llcas_objectid_get_digest(llcas_cas_t c_cas, + llcas_objectid_t c_id) { + auto &CAS = unwrap(c_cas)->DB->getGraphDB(); + ObjectID ID = ObjectID::fromOpaqueData(c_id.opaque); + ArrayRef Digest = CAS.getDigest(ID); + return llcas_digest_t{Digest.data(), Digest.size()}; +} + +llcas_lookup_result_t llcas_cas_contains_object(llcas_cas_t c_cas, + llcas_objectid_t c_id, + bool globally, char **error) { + ObjectID ID = ObjectID::fromOpaqueData(c_id.opaque); + return unwrap(c_cas)->containsObject(ID, globally) + ? 
LLCAS_LOOKUP_RESULT_SUCCESS + : LLCAS_LOOKUP_RESULT_NOTFOUND; +} + +llcas_lookup_result_t llcas_cas_load_object(llcas_cas_t c_cas, + llcas_objectid_t c_id, + llcas_loaded_object_t *c_obj_p, + char **error) { + ObjectID ID = ObjectID::fromOpaqueData(c_id.opaque); + Expected> ObjOpt = + unwrap(c_cas)->loadObject(ID); + if (!ObjOpt) + return reportError(ObjOpt.takeError(), error, LLCAS_LOOKUP_RESULT_ERROR); + if (!*ObjOpt) + return LLCAS_LOOKUP_RESULT_NOTFOUND; + + ondisk::ObjectHandle Obj = **ObjOpt; + *c_obj_p = llcas_loaded_object_t{Obj.getOpaqueData()}; + return LLCAS_LOOKUP_RESULT_SUCCESS; +} + +void llcas_cas_load_object_async(llcas_cas_t c_cas, llcas_objectid_t c_id, + void *ctx_cb, llcas_cas_load_object_cb cb) { + std::string PrintedDigest; + { + llcas_digest_t c_digest = llcas_objectid_get_digest(c_cas, c_id); + char *printed_id; + char *c_err; + bool failed = llcas_digest_print(c_cas, c_digest, &printed_id, &c_err); + if (failed) + report_fatal_error(Twine("digest printing failed: ") + c_err); + PrintedDigest = printed_id; + llcas_string_dispose(printed_id); + } + + auto passObject = [ctx_cb, + cb](Expected> Obj) { + if (!Obj) { + cb(ctx_cb, LLCAS_LOOKUP_RESULT_ERROR, llcas_loaded_object_t(), + copyNewMallocString(toString(Obj.takeError()))); + } else if (!*Obj) { + cb(ctx_cb, LLCAS_LOOKUP_RESULT_NOTFOUND, llcas_loaded_object_t(), + nullptr); + } else { + cb(ctx_cb, LLCAS_LOOKUP_RESULT_SUCCESS, + llcas_loaded_object_t{(*Obj)->getOpaqueData()}, nullptr); + } + }; + + auto &CAS = unwrap(c_cas)->DB->getGraphDB(); + ObjectID ID = ObjectID::fromOpaqueData(c_id.opaque); + if (CAS.containsObject(ID)) { + unwrap(c_cas)->syncErrs([&](raw_ostream &OS) { + OS << "load_object_async existing: " << PrintedDigest << '\n'; + }); + return passObject(unwrap(c_cas)->loadObject(ID)); + } + + if (!unwrap(c_cas)->UpstreamDB) + return passObject(std::nullopt); + + // Try "downloading" the node from upstream. 
+ + unwrap(c_cas)->syncErrs([&](raw_ostream &OS) { + OS << "load_object_async downstream begin: " << PrintedDigest << '\n'; + }); + unwrap(c_cas)->Pool.async([=] { + // Wait a bit for the caller to proceed. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + auto &Wrap = *unwrap(c_cas); + Wrap.syncErrs([&](raw_ostream &OS) { + OS << "load_object_async downstream end: " << PrintedDigest << '\n'; + }); + if (Wrap.SimulateMissingObjects) + return passObject(std::nullopt); + passObject(Wrap.loadObject(ID)); + }); +} + +bool llcas_cas_store_object(llcas_cas_t c_cas, llcas_data_t c_data, + const llcas_objectid_t *c_refs, size_t c_refs_count, + llcas_objectid_t *c_id_p, char **error) { + auto &CAS = unwrap(c_cas)->DB->getGraphDB(); + SmallVector Refs; + Refs.reserve(c_refs_count); + for (unsigned I = 0; I != c_refs_count; ++I) { + Refs.push_back(ObjectID::fromOpaqueData(c_refs[I].opaque)); + } + ArrayRef Data((const char *)c_data.data, c_data.size); + + SmallVector, 8> RefHashes; + RefHashes.reserve(c_refs_count); + for (ObjectID Ref : Refs) + RefHashes.push_back(CAS.getDigest(Ref)); + HashType Digest = BuiltinObjectHasher::hashObject(RefHashes, Data); + ObjectID StoredID = CAS.getReference(Digest); + + if (Error E = CAS.store(StoredID, Refs, Data)) + return reportError(std::move(E), error, true); + *c_id_p = llcas_objectid_t{StoredID.getOpaqueData()}; + return false; +} + +llcas_data_t llcas_loaded_object_get_data(llcas_cas_t c_cas, + llcas_loaded_object_t c_obj) { + auto &CAS = unwrap(c_cas)->DB->getGraphDB(); + ondisk::ObjectHandle Obj = ondisk::ObjectHandle::fromOpaqueData(c_obj.opaque); + auto Data = CAS.getObjectData(Obj); + return llcas_data_t{Data.data(), Data.size()}; +} + +llcas_object_refs_t llcas_loaded_object_get_refs(llcas_cas_t c_cas, + llcas_loaded_object_t c_obj) { + auto &CAS = unwrap(c_cas)->DB->getGraphDB(); + ondisk::ObjectHandle Obj = ondisk::ObjectHandle::fromOpaqueData(c_obj.opaque); + auto Refs = CAS.getObjectRefs(Obj); + return 
llcas_object_refs_t{Refs.begin().getOpaqueData(), + Refs.end().getOpaqueData()}; +} + +size_t llcas_object_refs_get_count(llcas_cas_t c_cas, + llcas_object_refs_t c_refs) { + auto B = object_refs_iterator::fromOpaqueData(c_refs.opaque_b); + auto E = object_refs_iterator::fromOpaqueData(c_refs.opaque_e); + return E - B; +} + +llcas_objectid_t llcas_object_refs_get_id(llcas_cas_t c_cas, + llcas_object_refs_t c_refs, + size_t index) { + auto RefsI = object_refs_iterator::fromOpaqueData(c_refs.opaque_b); + ObjectID Ref = *(RefsI + index); + return llcas_objectid_t{Ref.getOpaqueData()}; +} + +llcas_lookup_result_t +llcas_actioncache_get_for_digest(llcas_cas_t c_cas, llcas_digest_t c_key, + llcas_objectid_t *p_value, bool globally, + char **error) { + auto &Wrap = *unwrap(c_cas); + auto &DB = *Wrap.DB; + ArrayRef Key(c_key.data, c_key.size); + std::optional Value; + if (Error E = DB.KVGet(Key).moveInto(Value)) + return reportError(std::move(E), error, LLCAS_LOOKUP_RESULT_ERROR); + if (!Value) { + if (!globally) + return LLCAS_LOOKUP_RESULT_NOTFOUND; + + if (Error E = Wrap.downstreamKey(Key).moveInto(Value)) + return reportError(std::move(E), error, LLCAS_LOOKUP_RESULT_ERROR); + if (!Value) + return LLCAS_LOOKUP_RESULT_NOTFOUND; + } + *p_value = llcas_objectid_t{Value->getOpaqueData()}; + return LLCAS_LOOKUP_RESULT_SUCCESS; +} + +void llcas_actioncache_get_for_digest_async(llcas_cas_t c_cas, + llcas_digest_t c_key, bool globally, + void *ctx_cb, + llcas_actioncache_get_cb cb) { + ArrayRef Key(c_key.data, c_key.size); + SmallVector KeyBuf(Key); + + unwrap(c_cas)->Pool.async([=] { + llcas_objectid_t c_value; + char *c_err; + llcas_lookup_result_t result = llcas_actioncache_get_for_digest( + c_cas, llcas_digest_t{KeyBuf.data(), KeyBuf.size()}, &c_value, globally, + &c_err); + cb(ctx_cb, result, c_value, c_err); + }); +} + +bool llcas_actioncache_put_for_digest(llcas_cas_t c_cas, llcas_digest_t c_key, + llcas_objectid_t c_value, bool globally, + char **error) { + auto &Wrap = 
*unwrap(c_cas); + auto &DB = *Wrap.DB; + ObjectID Value = ObjectID::fromOpaqueData(c_value.opaque); + ArrayRef Key(c_key.data, c_key.size); + Expected Ret = DB.KVPut(Key, Value); + if (!Ret) + return reportError(Ret.takeError(), error, true); + if (*Ret != Value) + return reportError( + createStringError(errc::invalid_argument, "cache poisoned"), error, + true); + + if (globally) { + if (Error E = Wrap.upstreamKey(Key, Value)) + return reportError(std::move(E), error, true); + } + + return false; +} + +void llcas_actioncache_put_for_digest_async(llcas_cas_t c_cas, + llcas_digest_t c_key, + llcas_objectid_t c_value, + bool globally, void *ctx_cb, + llcas_actioncache_put_cb cb) { + ArrayRef Key(c_key.data, c_key.size); + SmallVector KeyBuf(Key); + + unwrap(c_cas)->Pool.async([=] { + char *c_err; + bool failed = llcas_actioncache_put_for_digest( + c_cas, llcas_digest_t{KeyBuf.data(), KeyBuf.size()}, c_value, globally, + &c_err); + cb(ctx_cb, failed, c_err); + }); +} diff --git a/llvm/tools/libCASPluginTest/libCASPluginTest.exports b/llvm/tools/libCASPluginTest/libCASPluginTest.exports new file mode 100644 index 00000000000000..8fda2c5559c92f --- /dev/null +++ b/llvm/tools/libCASPluginTest/libCASPluginTest.exports @@ -0,0 +1,26 @@ +llcas_actioncache_get_for_digest +llcas_actioncache_get_for_digest_async +llcas_actioncache_put_for_digest +llcas_actioncache_put_for_digest_async +llcas_cas_contains_object +llcas_cas_create +llcas_cas_dispose +llcas_cas_get_hash_schema_name +llcas_cas_get_objectid +llcas_cas_load_object +llcas_cas_load_object_async +llcas_cas_options_create +llcas_cas_options_dispose +llcas_cas_options_set_client_version +llcas_cas_options_set_ondisk_path +llcas_cas_options_set_option +llcas_cas_store_object +llcas_digest_parse +llcas_digest_print +llcas_get_plugin_version +llcas_loaded_object_get_data +llcas_loaded_object_get_refs +llcas_object_refs_get_count +llcas_object_refs_get_id +llcas_objectid_get_digest +llcas_string_dispose diff --git 
a/llvm/tools/llvm-cas/CMakeLists.txt b/llvm/tools/llvm-cas/CMakeLists.txt new file mode 100644 index 00000000000000..6093a906b503a7 --- /dev/null +++ b/llvm/tools/llvm-cas/CMakeLists.txt @@ -0,0 +1,8 @@ +set(LLVM_LINK_COMPONENTS + Support + CAS + ) + +add_llvm_tool(llvm-cas + llvm-cas.cpp + ) diff --git a/llvm/tools/llvm-cas/llvm-cas.cpp b/llvm/tools/llvm-cas/llvm-cas.cpp new file mode 100644 index 00000000000000..f5341a89e1a76e --- /dev/null +++ b/llvm/tools/llvm-cas/llvm-cas.cpp @@ -0,0 +1,453 @@ +//===- llvm-cas.cpp - CAS tool --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "llvm/CAS/CASRegistry.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/CAS/TreeSchema.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +using namespace llvm; +using namespace llvm::cas; + +static cl::opt AllTrees( + "all-trees", + cl::desc("Print all trees, not just empty ones, for ls-tree-recursive")); +static cl::list Inputs(cl::Positional, cl::desc("Input object")); + +static int dump(ObjectStore &CAS); +static int listTree(ObjectStore &CAS, const CASID &ID); +static int listTreeRecursively(ObjectStore &CAS, const CASID &ID); +static int listObjectReferences(ObjectStore &CAS, const CASID &ID); +static int catBlob(ObjectStore &CAS, const CASID &ID); +static int catNodeData(ObjectStore &CAS, const CASID &ID); +static int 
printKind(ObjectStore &CAS, const CASID &ID); +static int makeBlob(ObjectStore &CAS, StringRef DataPath); +static int makeNode(ObjectStore &CAS, ArrayRef References, + StringRef DataPath); +static int diffGraphs(ObjectStore &CAS, const CASID &LHS, const CASID &RHS); +static int traverseGraph(ObjectStore &CAS, const CASID &ID); +static int import(ObjectStore &CAS, ObjectStore &UpstreamCAS, + ArrayRef Objects); +static int putCacheKey(ObjectStore &CAS, ActionCache &AC, + ArrayRef Objects); +static int getCacheResult(ObjectStore &CAS, ActionCache &AC, const CASID &ID); + +int main(int Argc, char **Argv) { + InitLLVM X(Argc, Argv); + + cl::opt CASPath("cas", cl::desc("Path to CAS on disk."), + cl::value_desc("path")); + cl::opt CASPluginPath("fcas-plugin-path", + cl::desc("Path to plugin CAS library"), + cl::value_desc("path")); + cl::list CASPluginOpts("fcas-plugin-option", + cl::desc("Plugin CAS Options")); + cl::opt UpstreamCASPath( + "upstream-cas", cl::desc("Path to another CAS."), cl::value_desc("path")); + cl::opt DataPath("data", + cl::desc("Path to data or '-' for stdin."), + cl::value_desc("path")); + + enum CommandKind { + Invalid, + Dump, + PrintKind, + CatBlob, + CatNodeData, + DiffGraphs, + TraverseGraph, + MakeBlob, + MakeNode, + ListTree, + ListTreeRecursive, + ListObjectReferences, + Import, + PutCacheKey, + GetCacheResult, + }; + cl::opt Command( + cl::desc("choose command action:"), + cl::values( + clEnumValN(Dump, "dump", "dump internal contents"), + clEnumValN(PrintKind, "print-kind", "print kind"), + clEnumValN(CatBlob, "cat-blob", "cat blob"), + clEnumValN(CatNodeData, "cat-node-data", "cat node data"), + clEnumValN(DiffGraphs, "diff-graphs", "diff graphs"), + clEnumValN(TraverseGraph, "traverse-graph", "traverse graph"), + clEnumValN(MakeBlob, "make-blob", "make blob"), + clEnumValN(MakeNode, "make-node", "make node"), + clEnumValN(ListTree, "ls-tree", "list tree"), + clEnumValN(ListTreeRecursive, "ls-tree-recursive", + "list tree recursive"), + 
clEnumValN(ListObjectReferences, "ls-node-refs", "list node refs"), + clEnumValN(Import, "import", "import objects from another CAS"), + clEnumValN(PutCacheKey, "put-cache-key", + "set a value for a cache key"), + clEnumValN(GetCacheResult, "get-cache-result", + "get the result value from a cache key")), + cl::init(CommandKind::Invalid)); + + cl::ParseCommandLineOptions(Argc, Argv, "llvm-cas CAS tool\n"); + ExitOnError ExitOnErr("llvm-cas: "); + + if (Command == CommandKind::Invalid) + ExitOnErr(createStringError(inconvertibleErrorCode(), + "no command action is specified")); + + // FIXME: Consider creating an in-memory CAS. + if (CASPath.empty()) + ExitOnErr( + createStringError(inconvertibleErrorCode(), "missing --cas=")); + + std::shared_ptr CAS; + std::shared_ptr AC; + if (isRegisteredCASIdentifier(CASPath)) + std::tie(CAS, AC) = ExitOnErr(createCASFromIdentifier(CASPath)); + else + std::tie(CAS, AC) = ExitOnErr(createOnDiskUnifiedCASDatabases(CASPath)); + + std::shared_ptr UpstreamCAS; + if (!UpstreamCASPath.empty()) + UpstreamCAS = + std::move(ExitOnErr(createCASFromIdentifier(UpstreamCASPath)).first); + + if (Command == Dump) + return dump(*CAS); + + if (Command == MakeBlob) + return makeBlob(*CAS, DataPath); + + if (Command == MakeNode) + return makeNode(*CAS, Inputs, DataPath); + + if (Command == DiffGraphs) { + ExitOnError CommandErr("llvm-cas: diff-graphs"); + + if (Inputs.size() != 2) + CommandErr( + createStringError(inconvertibleErrorCode(), "expected 2 objects")); + + CASID LHS = ExitOnErr(CAS->parseID(Inputs[0])); + CASID RHS = ExitOnErr(CAS->parseID(Inputs[1])); + return diffGraphs(*CAS, LHS, RHS); + } + + if (Inputs.empty()) + ExitOnErr(createStringError(inconvertibleErrorCode(), + "missing to operate on")); + + if (Command == Import) { + if (!UpstreamCAS) + ExitOnErr(createStringError(inconvertibleErrorCode(), + "missing '-upstream-cas'")); + return import(*CAS, *UpstreamCAS, Inputs); + } + + if (Command == PutCacheKey || Command == 
GetCacheResult) { + if (!AC) + ExitOnErr(createStringError(inconvertibleErrorCode(), + "no action-cache available")); + } + + if (Command == PutCacheKey) + return putCacheKey(*CAS, *AC, Inputs); + + // Remaining commands need exactly one CAS object. + if (Inputs.size() > 1) + ExitOnErr(createStringError(inconvertibleErrorCode(), + "too many s, expected 1")); + CASID ID = ExitOnErr(CAS->parseID(Inputs.front())); + + if (Command == GetCacheResult) + return getCacheResult(*CAS, *AC, ID); + + if (Command == TraverseGraph) + return traverseGraph(*CAS, ID); + + if (Command == ListTree) + return listTree(*CAS, ID); + + if (Command == ListTreeRecursive) + return listTreeRecursively(*CAS, ID); + + if (Command == ListObjectReferences) + return listObjectReferences(*CAS, ID); + + if (Command == CatNodeData) + return catNodeData(*CAS, ID); + + if (Command == PrintKind) + return printKind(*CAS, ID); + + assert(Command == CatBlob); + return catBlob(*CAS, ID); +} + +int listTree(ObjectStore &CAS, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: ls-tree: "); + + TreeSchema Schema(CAS); + ObjectProxy TreeN = ExitOnErr(CAS.getProxy(ID)); + TreeProxy Tree = ExitOnErr(Schema.load(TreeN)); + ExitOnErr(Tree.forEachEntry([&](const NamedTreeEntry &Entry) { + Entry.print(llvm::outs(), CAS); + return Error::success(); + })); + + return 0; +} + +int listTreeRecursively(ObjectStore &CAS, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: ls-tree-recursively: "); + TreeSchema Schema(CAS); + ObjectProxy TreeN = ExitOnErr(CAS.getProxy(ID)); + ExitOnErr(Schema.walkFileTreeRecursively( + CAS, TreeN.getRef(), + [&](const NamedTreeEntry &Entry, std::optional Tree) -> Error { + if (Entry.getKind() != TreeEntry::Tree) { + Entry.print(llvm::outs(), CAS); + return Error::success(); + } + if (Tree->empty() || AllTrees) + Entry.print(llvm::outs(), CAS); + return Error::success(); + })); + + return 0; +} + +int catBlob(ObjectStore &CAS, const CASID &ID) { return catNodeData(CAS, ID); } + +static 
Expected> openBuffer(StringRef DataPath) { + if (DataPath.empty()) + return createStringError(inconvertibleErrorCode(), "--data missing"); + return errorOrToExpected(DataPath == "-" + ? llvm::MemoryBuffer::getSTDIN() + : llvm::MemoryBuffer::getFile(DataPath)); +} + +int dump(ObjectStore &CAS) { + ExitOnError ExitOnErr("llvm-cas: dump: "); + CAS.print(llvm::outs()); + return 0; +} + +int makeBlob(ObjectStore &CAS, StringRef DataPath) { + ExitOnError ExitOnErr("llvm-cas: make-blob: "); + std::unique_ptr Buffer = ExitOnErr(openBuffer(DataPath)); + + ObjectProxy Blob = + ExitOnErr(CAS.createProxy(std::nullopt, Buffer->getBuffer())); + llvm::outs() << Blob.getID() << "\n"; + return 0; +} + +int catNodeData(ObjectStore &CAS, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: cat-node-data: "); + llvm::outs() << ExitOnErr(CAS.getProxy(ID)).getData(); + return 0; +} + +static StringRef getKindString(ObjectStore &CAS, ObjectProxy Object) { + TreeSchema Schema(CAS); + if (Schema.isNode(Object)) + return "tree"; + return "object"; +} + +int printKind(ObjectStore &CAS, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: print-kind: "); + ObjectProxy Object = ExitOnErr(CAS.getProxy(ID)); + + llvm::outs() << getKindString(CAS, Object) << "\n"; + return 0; +} + +int listObjectReferences(ObjectStore &CAS, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: ls-node-refs: "); + + ObjectProxy Object = ExitOnErr(CAS.getProxy(ID)); + ExitOnErr(Object.forEachReference([&](ObjectRef Ref) -> Error { + llvm::outs() << CAS.getID(Ref) << "\n"; + return Error::success(); + })); + + return 0; +} + +static int makeNode(ObjectStore &CAS, ArrayRef Objects, + StringRef DataPath) { + std::unique_ptr Data = + ExitOnError("llvm-cas: make-node: data: ")(openBuffer(DataPath)); + + SmallVector IDs; + for (StringRef Object : Objects) { + ExitOnError ObjectErr("llvm-cas: make-node: ref: "); + std::optional ID = + CAS.getReference(ObjectErr(CAS.parseID(Object))); + if (!ID) + 
ObjectErr(createStringError(inconvertibleErrorCode(), + "unknown object '" + Object + "'")); + IDs.push_back(*ID); + } + + ExitOnError ExitOnErr("llvm-cas: make-node: "); + ObjectProxy Object = ExitOnErr(CAS.createProxy(IDs, Data->getBuffer())); + llvm::outs() << Object.getID() << "\n"; + return 0; +} + +namespace { +struct GraphInfo { + SmallVector PostOrder; + DenseSet Seen; +}; +} // namespace + +static GraphInfo traverseObjectGraph(ObjectStore &CAS, const CASID &TopLevel) { + ExitOnError ExitOnErr("llvm-cas: traverse-node-graph: "); + GraphInfo Info; + + SmallVector> Worklist; + auto push = [&](CASID ID) { + if (Info.Seen.insert(ID).second) + Worklist.push_back({ID, false}); + }; + push(TopLevel); + while (!Worklist.empty()) { + if (Worklist.back().second) { + Info.PostOrder.push_back(Worklist.pop_back_val().first); + continue; + } + Worklist.back().second = true; + CASID ID = Worklist.back().first; + ObjectProxy Object = ExitOnErr(CAS.getProxy(ID)); + + TreeSchema Schema(CAS); + if (Schema.isNode(Object)) { + TreeProxy Tree = ExitOnErr(Schema.load(Object)); + ExitOnErr(Tree.forEachEntry([&](const NamedTreeEntry &Entry) { + push(CAS.getID(Entry.getRef())); + return Error::success(); + })); + continue; + } + + ExitOnErr(Object.forEachReference([&](ObjectRef Ref) { + push(CAS.getID(Ref)); + return Error::success(); + })); + } + + return Info; +} + +static void printDiffs(ObjectStore &CAS, const GraphInfo &Baseline, + const GraphInfo &New, StringRef NewName) { + ExitOnError ExitOnErr("llvm-cas: diff-graphs: "); + + for (cas::CASID ID : New.PostOrder) { + if (Baseline.Seen.count(ID)) + continue; + + StringRef KindString; + ObjectProxy Object = ExitOnErr(CAS.getProxy(ID)); + KindString = getKindString(CAS, Object); + + outs() << llvm::formatv("{0}{1,-4} {2}\n", NewName, KindString, ID); + } +} + +int diffGraphs(ObjectStore &CAS, const CASID &LHS, const CASID &RHS) { + if (LHS == RHS) + return 0; + + ExitOnError ExitOnErr("llvm-cas: diff-graphs: "); + GraphInfo 
LHSInfo = traverseObjectGraph(CAS, LHS); + GraphInfo RHSInfo = traverseObjectGraph(CAS, RHS); + + printDiffs(CAS, RHSInfo, LHSInfo, "- "); + printDiffs(CAS, LHSInfo, RHSInfo, "+ "); + return 0; +} + +int traverseGraph(ObjectStore &CAS, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: traverse-graph: "); + GraphInfo Info = traverseObjectGraph(CAS, ID); + printDiffs(CAS, GraphInfo{}, Info, ""); + return 0; +} + +static ObjectRef importNode(ObjectStore &CAS, ObjectStore &UpstreamCAS, + const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: import: "); + + std::optional PrimaryRef = CAS.getReference(ID); + if (PrimaryRef) + return *PrimaryRef; // object is present. + + ObjectProxy UpstreamObj = ExitOnErr(UpstreamCAS.getProxy(ID)); + SmallVector Refs; + ExitOnErr(UpstreamObj.forEachReference([&](ObjectRef UpstreamRef) -> Error { + ObjectRef Ref = + importNode(CAS, UpstreamCAS, UpstreamCAS.getID(UpstreamRef)); + Refs.push_back(Ref); + return Error::success(); + })); + return ExitOnErr(CAS.storeFromString(Refs, UpstreamObj.getData())); +} + +static int import(ObjectStore &CAS, ObjectStore &UpstreamCAS, + ArrayRef Objects) { + ExitOnError ExitOnErr("llvm-cas: import: "); + + for (StringRef Object : Objects) { + CASID ID = ExitOnErr(CAS.parseID(Object)); + importNode(CAS, UpstreamCAS, ID); + } + return 0; +} + +static int putCacheKey(ObjectStore &CAS, ActionCache &AC, + ArrayRef Objects) { + ExitOnError ExitOnErr("llvm-cas: put-cache-key: "); + + if (Objects.size() % 2 != 0) + ExitOnErr(createStringError(inconvertibleErrorCode(), + "expected pairs of inputs")); + while (!Objects.empty()) { + CASID Key = ExitOnErr(CAS.parseID(Objects[0])); + CASID Result = ExitOnErr(CAS.parseID(Objects[1])); + Objects = Objects.drop_front(2); + ExitOnErr(AC.put(Key, Result)); + } + return 0; +} + +static int getCacheResult(ObjectStore &CAS, ActionCache &AC, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: get-cache-result: "); + + auto Result = ExitOnErr(AC.get(ID)); + if (!Result) 
{ + outs() << "result not found\n"; + return 1; + } + outs() << *Result << "\n"; + return 0; +} diff --git a/llvm/unittests/ADT/CMakeLists.txt b/llvm/unittests/ADT/CMakeLists.txt index 745e4d9fb74a4a..b0077d5b54a3ee 100644 --- a/llvm/unittests/ADT/CMakeLists.txt +++ b/llvm/unittests/ADT/CMakeLists.txt @@ -86,6 +86,7 @@ add_llvm_unittest(ADTTests StringSetTest.cpp StringSwitchTest.cpp TinyPtrVectorTest.cpp + TrieRawHashMapTest.cpp TwineTest.cpp TypeSwitchTest.cpp TypeTraitsTest.cpp diff --git a/llvm/unittests/ADT/TrieRawHashMapTest.cpp b/llvm/unittests/ADT/TrieRawHashMapTest.cpp new file mode 100644 index 00000000000000..24be6c4748ea6f --- /dev/null +++ b/llvm/unittests/ADT/TrieRawHashMapTest.cpp @@ -0,0 +1,344 @@ +//===- TrieRawHashMapTest.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/TrieRawHashMap.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/bit.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/SHA1.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace llvm { +class TrieRawHashMapTestHelper { +public: + TrieRawHashMapTestHelper() = default; + + void setTrie(ThreadSafeTrieRawHashMapBase *T) { Trie = T; } + + ThreadSafeTrieRawHashMapBase::PointerBase getRoot() const { + return Trie->getRoot(); + } + unsigned getStartBit(ThreadSafeTrieRawHashMapBase::PointerBase P) const { + return Trie->getStartBit(P); + } + unsigned getNumBits(ThreadSafeTrieRawHashMapBase::PointerBase P) const { + return Trie->getNumBits(P); + } + unsigned getNumSlotUsed(ThreadSafeTrieRawHashMapBase::PointerBase P) const { + return Trie->getNumSlotUsed(P); + } + unsigned getNumTries() const { return Trie->getNumTries(); } + std::string + 
getTriePrefixAsString(ThreadSafeTrieRawHashMapBase::PointerBase P) const { + return Trie->getTriePrefixAsString(P); + } + ThreadSafeTrieRawHashMapBase::PointerBase + getNextTrie(ThreadSafeTrieRawHashMapBase::PointerBase P) const { + return Trie->getNextTrie(P); + } + +private: + ThreadSafeTrieRawHashMapBase *Trie = nullptr; +}; +} // namespace llvm + +namespace { +template +class SimpleTrieHashMapTest : public TrieRawHashMapTestHelper, + public ::testing::Test { +public: + using NumType = DataType; + using HashType = std::array; + using TrieType = ThreadSafeTrieRawHashMap; + + TrieType &createTrie(size_t RootBits, size_t SubtrieBits) { + auto &Ret = Trie.emplace(RootBits, SubtrieBits); + TrieRawHashMapTestHelper::setTrie(&Ret); + return Ret; + } + + void destroyTrie() { Trie.reset(); } + + ~SimpleTrieHashMapTest() { + if (Trie) + Trie.reset(); + } + + // Use the number itself as hash to test the pathological case. + static HashType hash(uint64_t Num) { + uint64_t HashN = + llvm::support::endian::byte_swap(Num, llvm::endianness::big); + HashType Hash; + memcpy(&Hash[0], &HashN, sizeof(HashType)); + return Hash; + }; + +private: + std::optional Trie; +}; + +using SmallNodeTrieTest = SimpleTrieHashMapTest; + +TEST_F(SmallNodeTrieTest, TrieAllocation) { + NumType Numbers[] = { + 0x0, std::numeric_limits::max(), 0x1, 0x2, + 0x3, std::numeric_limits::max() - 1u, + }; + + unsigned ExpectedTries[] = { + 1, // Allocate Root. + 1, // Both on the root. + 64, // 0 and 1 sinks all the way down. + 64, // no new allocation needed. + 65, // need a new node between 2 and 3. + 65 + 63, // 63 new allocation to sink two big numbers all the way. + }; + + const char *ExpectedPrefix[] = { + "", // Root. + "", // Root. + "00000000000000[0000000]", + "00000000000000[0000000]", + "00000000000000[0000001]", + "ffffffffffffff[1111111]", + }; + + // Use root and subtrie sizes of 1 so this gets sunk quite deep. 
+ auto &Trie = createTrie(/*RootBits=*/1, /*SubtrieBits=*/1); + + for (unsigned I = 0; I < 6; ++I) { + // Lookup first to exercise hint code for deep tries. + TrieType::pointer Lookup = Trie.find(hash(Numbers[I])); + EXPECT_FALSE(Lookup); + + Trie.insert(Lookup, TrieType::value_type(hash(Numbers[I]), Numbers[I])); + EXPECT_EQ(getNumTries(), ExpectedTries[I]); + EXPECT_EQ(getTriePrefixAsString(getNextTrie(getRoot())), ExpectedPrefix[I]); + } +} + +TEST_F(SmallNodeTrieTest, TrieStructure) { + NumType Numbers[] = { + // Three numbers that will nest deeply to test (1) sinking subtries and + // (2) deep, non-trivial hints. + std::numeric_limits::max(), + std::numeric_limits::max() - 2u, + std::numeric_limits::max() - 3u, + // One number to stay at the top-level. + 0x37, + }; + + // Use root and subtrie sizes of 1 so this gets sunk quite deep. + auto &Trie = createTrie(/*RootBits=*/1, /*SubtrieBits=*/1); + + for (NumType N : Numbers) { + // Lookup first to exercise hint code for deep tries. + TrieType::pointer Lookup = Trie.find(hash(N)); + EXPECT_FALSE(Lookup); + + Trie.insert(Lookup, TrieType::value_type(hash(N), N)); + } + for (NumType N : Numbers) { + TrieType::pointer Lookup = Trie.find(hash(N)); + EXPECT_TRUE(Lookup); + if (!Lookup) + continue; + EXPECT_EQ(hash(N), Lookup->Hash); + EXPECT_EQ(N, Lookup->Data); + + // Confirm a subsequent insertion fails to overwrite by trying to insert a + // bad value. + auto Result = Trie.insert(Lookup, TrieType::value_type(hash(N), N - 1)); + EXPECT_EQ(N, Result->Data); + } + + // Check the trie so we can confirm the structure is correct. Each subtrie + // should have 2 slots. The root's index=0 should have the content for + // 0x37 directly, and index=1 should be a linked-list of subtries, finally + // ending with content for (max-2) and (max-3). 
+ // + // Note: This structure is not exhaustive (too expensive to update tests), + // but it does test that the dump format is somewhat readable and that the + // basic structure is correct. + // + // Note: This test requires that the trie reads bytes starting from index 0 + // of the array of uint8_t, and then reads each byte's bits from high to low. + + // Check the Trie. + // We should have allocated a total of 64 SubTries for 64 bit hash. + ASSERT_EQ(getNumTries(), 64u); + // Check the root trie. Two slots and both are used. + ASSERT_EQ(getNumSlotUsed(getRoot()), 2u); + // Check last subtrie. + // Last allocated trie is the next node in the allocation chain. + auto LastAlloctedSubTrie = getNextTrie(getRoot()); + ASSERT_EQ(getTriePrefixAsString(LastAlloctedSubTrie), + "ffffffffffffff[1111110]"); + ASSERT_EQ(getStartBit(LastAlloctedSubTrie), 63u); + ASSERT_EQ(getNumBits(LastAlloctedSubTrie), 1u); + ASSERT_EQ(getNumSlotUsed(LastAlloctedSubTrie), 2u); +} + +TEST_F(SmallNodeTrieTest, TrieStructureSmallFinalSubtrie) { + NumType Numbers[] = { + // Three numbers that will nest deeply to test (1) sinking subtries and + // (2) deep, non-trivial hints. + std::numeric_limits::max(), + std::numeric_limits::max() - 2u, + std::numeric_limits::max() - 3u, + // One number to stay at the top-level. + 0x37, + }; + + // Use subtrie size of 5 to avoid hitting 64 evenly, making the final subtrie + // small. + auto &Trie = createTrie(/*RootBits=*/8, /*SubtrieBits=*/5); + + for (NumType N : Numbers) { + // Lookup first to exercise hint code for deep tries. + TrieType::pointer Lookup = Trie.find(hash(N)); + EXPECT_FALSE(Lookup); + + Trie.insert(Lookup, TrieType::value_type(hash(N), N)); + } + for (NumType N : Numbers) { + TrieType::pointer Lookup = Trie.find(hash(N)); + EXPECT_TRUE(Lookup); + if (!Lookup) + continue; + EXPECT_EQ(hash(N), Lookup->Hash); + EXPECT_EQ(N, Lookup->Data); + + // Confirm a subsequent insertion fails to overwrite by trying to insert a + // bad value.
+ auto Result = Trie.insert(Lookup, TrieType::value_type(hash(N), N - 1)); + EXPECT_EQ(N, Result->Data); + } + + // Check the trie so we can confirm the structure is correct. The root + // should have 2^8=256 slots, most subtries should have 2^5=32 slots, and the + // deepest subtrie should have 2^1=2 slots (since (64-8)mod(5)=1). + // The root's index=0 should have the content for + // 0x37 directly, and index=255 should be a linked-list of subtries, finally + // ending with content for (max-2) and (max-3). + // + // Note: This structure is not exhaustive (too expensive to update tests), + // but it does test that the dump format is somewhat readable and that the + // basic structure is correct. + // + // Note: This test requires that the trie reads bytes starting from index 0 + // of the array of uint8_t, and then reads each byte's bits from high to low. + + // Check the Trie. + // 64 bit hash = 8 + 5 * 11 + 1, so 1 root, 11 5-bit subtries and 1 last + // level subtrie, 13 total. + ASSERT_EQ(getNumTries(), 13u); + // Check the root trie. It has 256 slots, of which two are used. + ASSERT_EQ(getNumSlotUsed(getRoot()), 2u); + // Check last subtrie. + // Last allocated trie is the next node in the allocation chain. + auto LastAlloctedSubTrie = getNextTrie(getRoot()); + ASSERT_EQ(getTriePrefixAsString(LastAlloctedSubTrie), + "ffffffffffffff[1111110]"); + ASSERT_EQ(getStartBit(LastAlloctedSubTrie), 63u); + ASSERT_EQ(getNumBits(LastAlloctedSubTrie), 1u); + ASSERT_EQ(getNumSlotUsed(LastAlloctedSubTrie), 2u); +} + +TEST_F(SmallNodeTrieTest, TrieDestructionLoop) { + // Test destroying large Trie. Make sure there is no recursion that can + // overflow the stack. + + // Limit the tries to 2 slots (1 bit) to generate subtries at a higher rate. + auto &Trie = createTrie(/*NumRootBits=*/1, /*NumSubtrieBits=*/1); + + // Fill them up. Pick a MaxN high enough to cause a stack overflow in debug + // builds.
+ static constexpr uint64_t MaxN = 100000; + for (uint64_t N = 0; N != MaxN; ++N) { + HashType Hash = hash(N); + Trie.insert(TrieType::pointer(), TrieType::value_type(Hash, NumType{N})); + } + + // Destroy tries. If destruction is recursive and MaxN is high enough, these + // will both fail. + destroyTrie(); +} + +struct NumWithDestructorT { + uint64_t Num; + ~NumWithDestructorT() {} +}; + +using NodeWithDestructorTrieTest = + SimpleTrieHashMapTest; + +TEST_F(NodeWithDestructorTrieTest, TrieDestructionLoop) { + // Test destroying large Trie. Make sure there is no recursion that can + // overflow the stack. + + // Limit the tries to 2 slots (1 bit) to generate subtries at a higher rate. + auto &Trie = createTrie(/*NumRootBits=*/1, /*NumSubtrieBits=*/1); + + // Fill them up. Pick a MaxN high enough to cause a stack overflow in debug + // builds. + static constexpr uint64_t MaxN = 100000; + for (uint64_t N = 0; N != MaxN; ++N) { + HashType Hash = hash(N); + Trie.insert(TrieType::pointer(), TrieType::value_type(Hash, NumType{N})); + } + + // Destroy tries. If destruction is recursive and MaxN is high enough, these + // will both fail. + destroyTrie(); +} + +using NumStrNodeTrieTest = SimpleTrieHashMapTest; + +TEST_F(NumStrNodeTrieTest, TrieInsertLazy) { + for (unsigned RootBits : {2, 3, 6, 10}) { + for (unsigned SubtrieBits : {2, 3, 4}) { + auto &Trie = createTrie(RootBits, SubtrieBits); + for (int I = 0, E = 1000; I != E; ++I) { + TrieType::pointer Lookup; + HashType H = hash(I); + if (I & 1) + Lookup = Trie.find(H); + + auto insertNum = [&](uint64_t Num) { + std::string S = Twine(I).str(); + auto Hash = hash(Num); + return Trie.insertLazy( + Hash, [&](TrieType::LazyValueConstructor C) { C(std::move(S)); }); + }; + auto S1 = insertNum(I); + // The address of the Data should be the same. 
+ EXPECT_EQ(&S1->Data, &insertNum(I)->Data); + + auto insertStr = [&](std::string S) { + int Num = std::stoi(S); + return insertNum(Num); + }; + std::string S2 = S1->Data; + // The address of the Data should be the same. + EXPECT_EQ(&S1->Data, &insertStr(S2)->Data); + } + for (int I = 0, E = 1000; I != E; ++I) { + std::string S = Twine(I).str(); + TrieType::pointer Lookup = Trie.find(hash(I)); + EXPECT_TRUE(Lookup); + if (!Lookup) + continue; + EXPECT_EQ(S, Lookup->Data); + } + } + } +} +} // end anonymous namespace diff --git a/llvm/unittests/CAS/ActionCacheTest.cpp b/llvm/unittests/CAS/ActionCacheTest.cpp new file mode 100644 index 00000000000000..566630e8a3c363 --- /dev/null +++ b/llvm/unittests/CAS/ActionCacheTest.cpp @@ -0,0 +1,156 @@ +//===- ActionCacheTest.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/ActionCache.h" +#include "CASTestConfig.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::cas; + +TEST_P(CASTest, ActionCacheHit) { + std::shared_ptr CAS = createObjectStore(); + std::shared_ptr Cache = createActionCache(); + + std::optional ID; + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID), + Succeeded()); + std::optional ResultID; + ASSERT_THAT_ERROR(Cache->put(*ID, *ID), Succeeded()); + ASSERT_THAT_ERROR(Cache->get(*ID).moveInto(ResultID), Succeeded()); + ASSERT_TRUE(ResultID); + std::optional Result = CAS->getReference(*ResultID); + ASSERT_TRUE(Result); + ASSERT_EQ(*ID, *Result); +} + +TEST_P(CASTest, ActionCacheMiss) { + 
std::shared_ptr CAS = createObjectStore(); + std::shared_ptr Cache = createActionCache(); + + std::optional ID1, ID2; + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1), + Succeeded()); + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2), + Succeeded()); + ASSERT_THAT_ERROR(Cache->put(*ID1, *ID2), Succeeded()); + // This is a cache miss: looking up a key that doesn't exist. + std::optional Result1; + ASSERT_THAT_ERROR(Cache->get(*ID2).moveInto(Result1), Succeeded()); + ASSERT_FALSE(Result1); + + ASSERT_THAT_ERROR(Cache->put(*ID2, *ID1), Succeeded()); + // Cache hit after adding the value. + std::optional Result2; + ASSERT_THAT_ERROR(Cache->get(*ID2).moveInto(Result2), Succeeded()); + ASSERT_TRUE(Result2); + std::optional Ref = CAS->getReference(*Result2); + ASSERT_TRUE(Ref); + ASSERT_EQ(*ID1, *Ref); +} + +TEST_P(CASTest, ActionCacheRewrite) { + std::shared_ptr CAS = createObjectStore(); + std::shared_ptr Cache = createActionCache(); + + std::optional ID1, ID2; + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1), + Succeeded()); + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2), + Succeeded()); + ASSERT_THAT_ERROR(Cache->put(*ID1, *ID1), Succeeded()); + // Writing to the same key with a different value is an error. + ASSERT_THAT_ERROR(Cache->put(*ID1, *ID2), Failed()); + // Writing the same value multiple times to the same key is fine.
+ ASSERT_THAT_ERROR(Cache->put(*ID1, *ID1), Succeeded()); +} + +#if LLVM_ENABLE_ONDISK_CAS +TEST(OnDiskActionCache, ActionCacheResultInvalid) { + unittest::TempDir Temp("on-disk-cache", /*Unique=*/true); + std::unique_ptr CAS1 = createInMemoryCAS(); + std::unique_ptr CAS2 = createInMemoryCAS(); + + std::optional ID1, ID2, ID3; + ASSERT_THAT_ERROR(CAS1->createProxy(std::nullopt, "1").moveInto(ID1), + Succeeded()); + ASSERT_THAT_ERROR(CAS1->createProxy(std::nullopt, "2").moveInto(ID2), + Succeeded()); + ASSERT_THAT_ERROR(CAS2->createProxy(std::nullopt, "1").moveInto(ID3), + Succeeded()); + + std::unique_ptr Cache1 = + cantFail(createOnDiskActionCache(Temp.path())); + // Test put and get. + ASSERT_THAT_ERROR(Cache1->put(*ID1, *ID2), Succeeded()); + std::optional Result; + ASSERT_THAT_ERROR(Cache1->get(*ID1).moveInto(Result), Succeeded()); + ASSERT_TRUE(Result); + + // Open a second on-disk action cache at the same path, used with CAS2. + std::unique_ptr Cache2 = + cantFail(createOnDiskActionCache(Temp.path())); + // Loading a key that points to an invalid object. + std::optional Result2; + // Get will work but the resulting CASID doesn't exist in ObjectStore. + ASSERT_THAT_ERROR(Cache2->get(*ID3).moveInto(Result2), Succeeded()); + ASSERT_FALSE(CAS2->getReference(*Result2)); + // Writing a different value will cause an error. + ASSERT_THAT_ERROR(Cache2->put(*ID3, *ID3), Failed()); +} +#endif + +#ifndef _MSC_VER +/// FIXME: MSVC doesn't compile Error within Promise/Future correctly and will +/// result in unchecked error. Disable AsyncAPIs when using MSVC for now.
+TEST_P(CASTest, ActionCacheAsync) { + std::shared_ptr CAS = createObjectStore(); + std::shared_ptr Cache = createActionCache(); + + { + std::optional ID; + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID), + Succeeded()); + auto PutFuture = Cache->putFuture(*ID, *ID); + ASSERT_THAT_ERROR(PutFuture.get().take(), Succeeded()); + auto GetFuture = Cache->getFuture(*ID); + std::optional ResultID; + ASSERT_THAT_ERROR(GetFuture.get().take().moveInto(ResultID), Succeeded()); + ASSERT_TRUE(ResultID); + } + + std::optional ID2; + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2), + Succeeded()); + { + std::promise Promise; + auto Future = Promise.get_future(); + Cache->putAsync(*ID2, *ID2, false, + [Promise = std::move(Promise)](Error E) mutable { + Promise.set_value(std::move(E)); + }); + ASSERT_THAT_ERROR(Future.get().take(), Succeeded()); + } + { + std::promise Promise; + auto Future = Promise.get_future(); + Cache->getAsync(*ID2, false, + [Promise = std::move(Promise)]( + Expected> Value) mutable { + Promise.set_value(std::move(Value)); + }); + std::optional ResultID; + ASSERT_THAT_ERROR(Future.get().take().moveInto(ResultID), Succeeded()); + ASSERT_TRUE(ResultID); + } +} +#endif diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp new file mode 100644 index 00000000000000..fd5dfa0143e496 --- /dev/null +++ b/llvm/unittests/CAS/CASTestConfig.cpp @@ -0,0 +1,88 @@ +//===- CASTestConfig.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CASTestConfig.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/CAS/PluginCAS.h" +#include "llvm/Config/config.h" +#include "gtest/gtest.h" +#include +#include + +using namespace llvm; +using namespace llvm::cas; + +// See llvm/utils/unittest/UnitTestMain/TestMain.cpp +extern const char *TestMainArgv0; + +// Just a reachable symbol to ease resolving of the executable's path. +static std::string TestStringArg1("plugincas-test-string-arg1"); + +std::string llvm::unittest::cas::getCASPluginPath() { + std::string Executable = + sys::fs::getMainExecutable(TestMainArgv0, &TestStringArg1); + llvm::SmallString<256> PathBuf(sys::path::parent_path( + sys::path::parent_path(sys::path::parent_path(Executable)))); + std::string LibName = "libCASPluginTest"; + sys::path::append(PathBuf, "lib", LibName + LLVM_PLUGIN_EXT); + return std::string(PathBuf); +} + +TestingAndDir createInMemory(int I) { + std::unique_ptr CAS = createInMemoryCAS(); + std::unique_ptr Cache = createInMemoryActionCache(); + return TestingAndDir{std::move(CAS), std::move(Cache), std::nullopt}; +} + +INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest, + ::testing::Values(createInMemory)); + +#if LLVM_ENABLE_ONDISK_CAS +#ifndef _WIN32 +__attribute__((constructor)) static void configureCASTestEnv() { + // Restrict the size of the on-disk CAS for tests. This allows testing in + // constrained environments (e.g. small TMPDIR). It also prevents leaving + // behind large files on file systems that do not support sparse files if a + // test crashes before resizing the file. 
+ static std::once_flag Flag; + std::call_once(Flag, [] { + size_t Limit = 100 * 1024 * 1024; + std::string LimitStr = std::to_string(Limit); + setenv("LLVM_CAS_MAX_MAPPING_SIZE", LimitStr.c_str(), /*overwrite=*/false); + }); +} +#endif + +TestingAndDir createOnDisk(int I) { + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + std::unique_ptr CAS; + EXPECT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + std::unique_ptr Cache; + EXPECT_THAT_ERROR(createOnDiskActionCache(Temp.path()).moveInto(Cache), + Succeeded()); + return TestingAndDir{std::move(CAS), std::move(Cache), std::move(Temp)}; +} +INSTANTIATE_TEST_SUITE_P(OnDiskCAS, CASTest, ::testing::Values(createOnDisk)); + +TestingAndDir createPluginCASImpl(int I) { + using namespace llvm::unittest::cas; + unittest::TempDir Temp("plugin-cas", /*Unique=*/true); + std::optional< + std::pair, std::shared_ptr>> + DBs; + EXPECT_THAT_ERROR( + createPluginCASDatabases(getCASPluginPath(), Temp.path(), {}) + .moveInto(DBs), + Succeeded()); + return TestingAndDir{std::move(DBs->first), std::move(DBs->second), + std::move(Temp)}; +} +INSTANTIATE_TEST_SUITE_P(PluginCAS, CASTest, + ::testing::Values(createPluginCASImpl)); +#endif /* LLVM_ENABLE_ONDISK_CAS */ diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h new file mode 100644 index 00000000000000..830c6015488bd1 --- /dev/null +++ b/llvm/unittests/CAS/CASTestConfig.h @@ -0,0 +1,56 @@ +//===- CASTestConfig.h ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H +#define LLVM_UNITTESTS_CASTESTCONFIG_H + +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" +#include + +namespace llvm::unittest::cas { +std::string getCASPluginPath(); +} // namespace llvm::unittest::cas + +struct TestingAndDir { + std::shared_ptr CAS; + std::shared_ptr Cache; + std::optional Temp; +}; + +class CASTest + : public testing::TestWithParam> { +protected: + std::optional NextCASIndex; + + llvm::SmallVector Dirs; + + std::shared_ptr createObjectStore() { + auto TD = GetParam()(++(*NextCASIndex)); + if (TD.Temp) + Dirs.push_back(std::move(*TD.Temp)); + return std::move(TD.CAS); + } + std::shared_ptr createActionCache() { + auto TD = GetParam()(++(*NextCASIndex)); + if (TD.Temp) + Dirs.push_back(std::move(*TD.Temp)); + return std::move(TD.Cache); + } + void SetUp() { NextCASIndex = 0; } + void TearDown() { + NextCASIndex = std::nullopt; + Dirs.clear(); + } +}; + +#endif diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt new file mode 100644 index 00000000000000..4c81e0f4728406 --- /dev/null +++ b/llvm/unittests/CAS/CMakeLists.txt @@ -0,0 +1,29 @@ +if (LLVM_ENABLE_ONDISK_CAS) + add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1) + set(ADDITIONAL_CAS_TEST_DEPS "CASPluginTest") +endif() + +set(LLVM_LINK_COMPONENTS + Support + CAS + TestingSupport + ) + +add_llvm_unittest(CASTests + ActionCacheTest.cpp + CASTestConfig.cpp + HierarchicalTreeBuilderTest.cpp + ObjectStoreTest.cpp + OnDiskCommonUtils.h + OnDiskGraphDBTest.cpp + OnDiskHashMappedTrieTest.cpp + OnDiskKeyValueDBTest.cpp + PluginCASTest.cpp + TreeSchemaTest.cpp + UnifiedOnDiskCacheTest.cpp + + DEPENDS + 
${ADDITIONAL_CAS_TEST_DEPS} + ) + +target_link_libraries(CASTests PRIVATE LLVMTestingSupport) diff --git a/llvm/unittests/CAS/HierarchicalTreeBuilderTest.cpp b/llvm/unittests/CAS/HierarchicalTreeBuilderTest.cpp new file mode 100644 index 00000000000000..ec8bfafe31f99e --- /dev/null +++ b/llvm/unittests/CAS/HierarchicalTreeBuilderTest.cpp @@ -0,0 +1,210 @@ +//===- HierarchicalTreeBuilderTest.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/HierarchicalTreeBuilder.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Support/Path.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" +#include + +using namespace llvm; +using namespace llvm::cas; + +static std::unique_ptr getBufferForName(ObjectStore &CAS, + TreeSchema &Tree, + ObjectRef Root, + StringRef Name) { + std::unique_ptr Buffer = nullptr; + StringRef Filename = sys::path::filename(Name, sys::path::Style::posix); + StringRef Dirname = sys::path::parent_path(Name, sys::path::Style::posix); + auto Err = Tree.walkFileTreeRecursively( + CAS, Root, + [&](const NamedTreeEntry &Entry, + std::optional Proxy) -> Error { + if (Proxy && Entry.getName() == Dirname) { + if (auto File = Proxy->lookup(Filename)) { + auto Ref = File->getRef(); + auto Loaded = CAS.getProxy(Ref); + if (!Loaded) + return Loaded.takeError(); + Buffer = Loaded->getMemoryBuffer(); + } + } + return Error::success(); + }); + EXPECT_THAT_ERROR(std::move(Err), Succeeded()); + return Buffer; +} + +TEST(HierarchicalTreeBuilderTest, Flat) { + std::unique_ptr CAS = createInMemoryCAS(); + + auto make = [&](StringRef Content) { + return *expectedToOptional(CAS->storeFromString(std::nullopt, Content)); + }; + + HierarchicalTreeBuilder 
Builder; + Builder.push(make("1"), TreeEntry::Regular, "/file1"); + Builder.push(make("1"), TreeEntry::Regular, "/1"); + Builder.push(make("2"), TreeEntry::Regular, "/2"); + std::optional Root; + ASSERT_THAT_ERROR(Builder.create(*CAS).moveInto(Root), Succeeded()); + + TreeSchema Tree(*CAS); + ASSERT_TRUE(Tree.isNode(*Root)); + + std::unique_ptr F1 = + getBufferForName(*CAS, Tree, Root->getRef(), "/1"); + std::unique_ptr F2 = + getBufferForName(*CAS, Tree, Root->getRef(), "/2"); + std::unique_ptr Ffile1 = + getBufferForName(*CAS, Tree, Root->getRef(), "/file1"); + ASSERT_TRUE(Ffile1); + ASSERT_TRUE(F1); + ASSERT_TRUE(F2); + EXPECT_EQ("1", F1->getBuffer()); + EXPECT_EQ("2", F2->getBuffer()); + EXPECT_EQ("1", Ffile1->getBuffer()); +} + +TEST(HierarchicalTreeBuilderTest, Nested) { + std::unique_ptr CAS = createInMemoryCAS(); + + auto make = [&](StringRef Content) { + return *expectedToOptional(CAS->storeFromString(std::nullopt, Content)); + }; + + HierarchicalTreeBuilder Builder; + Builder.push(make("blob2"), TreeEntry::Regular, "/d2"); + Builder.push(make("blob1"), TreeEntry::Regular, "/t1/d1"); + Builder.push(make("blob3"), TreeEntry::Regular, "/t3/d3"); + Builder.push(make("blob1"), TreeEntry::Regular, "/t3/t1nested/d1"); + Builder.push(make("blob1"), TreeEntry::Regular, "/t3/t2/d1also"); + Builder.push(make("blob2"), TreeEntry::Regular, "/t3/t2/d2"); + std::optional Root; + ASSERT_THAT_ERROR(Builder.create(*CAS).moveInto(Root), Succeeded()); + + TreeSchema Tree(*CAS); + ASSERT_TRUE(Tree.isNode(*Root)); + + std::unique_ptr F1 = + getBufferForName(*CAS, Tree, Root->getRef(), "/1"); + std::unique_ptr T1D1 = + getBufferForName(*CAS, Tree, Root->getRef(), "/t1/d1"); + std::unique_ptr T1NestedD1 = + getBufferForName(*CAS, Tree, Root->getRef(), "/t3/t1nested/d1"); + std::unique_ptr T3T2D1Also = + getBufferForName(*CAS, Tree, Root->getRef(), "/t3/t2/d1also"); + std::unique_ptr T3TD3 = + getBufferForName(*CAS, Tree, Root->getRef(), "/t3/d3"); + ASSERT_TRUE(T1D1); + 
ASSERT_TRUE(T1NestedD1); + ASSERT_TRUE(T3T2D1Also); + ASSERT_TRUE(T3TD3); + + EXPECT_EQ("blob1", T1D1->getBuffer()); + EXPECT_EQ("blob1", T1NestedD1->getBuffer()); + EXPECT_EQ("blob1", T3T2D1Also->getBuffer()); + EXPECT_EQ("blob3", T3TD3->getBuffer()); +} + +TEST(HierarchicalTreeBuilderTest, MergeDirectories) { + std::unique_ptr CAS = createInMemoryCAS(); + + auto make = [&](StringRef Content) { + return *expectedToOptional(CAS->storeFromString(std::nullopt, Content)); + }; + + auto createRoot = [&](StringRef Blob, StringRef Path, + std::optional &Root) { + HierarchicalTreeBuilder Builder; + Builder.push(make(Blob), TreeEntry::Regular, Path); + + std::optional H; + ASSERT_THAT_ERROR(Builder.create(*CAS).moveInto(H), Succeeded()); + Root = CAS->getReference(*H); + }; + + std::optional Root1; + createRoot("blob1", "/t1/d1", Root1); + std::optional Root2; + createRoot("blob2", "/t1/d2", Root2); + std::optional Root3; + createRoot("blob3", "/t1/nested/d1", Root3); + + HierarchicalTreeBuilder Builder; + Builder.pushTreeContent(*Root1, "/"); + Builder.pushTreeContent(*Root2, ""); + Builder.pushTreeContent(*Root3, "/"); + Builder.pushTreeContent(*Root1, ""); + Builder.pushTreeContent(*Root1, "other1/nest"); + std::optional Root; + ASSERT_THAT_ERROR(Builder.create(*CAS).moveInto(Root), Succeeded()); + + TreeSchema Tree(*CAS); + ASSERT_TRUE(Tree.isNode(*Root)); + + std::unique_ptr T1D1 = + getBufferForName(*CAS, Tree, Root->getRef(), "/t1/d1"); + std::unique_ptr T1D2 = + getBufferForName(*CAS, Tree, Root->getRef(), "/t1/d2"); + std::unique_ptr T1NestedD1 = + getBufferForName(*CAS, Tree, Root->getRef(), "/t1/nested/d1"); + std::unique_ptr OtherT1D1 = + getBufferForName(*CAS, Tree, Root->getRef(), "/other1/nest/t1/d1"); + ASSERT_TRUE(T1D1); + ASSERT_TRUE(T1D2); + ASSERT_TRUE(T1NestedD1); + ASSERT_TRUE(OtherT1D1); + + EXPECT_EQ("blob1", T1D1->getBuffer()); + EXPECT_EQ("blob2", T1D2->getBuffer()); + EXPECT_EQ("blob3", T1NestedD1->getBuffer()); + EXPECT_EQ("blob1", 
OtherT1D1->getBuffer()); +} + +TEST(HierarchicalTreeBuilderTest, MergeDirectoriesConflict) { + std::unique_ptr CAS = createInMemoryCAS(); + + auto make = [&](StringRef Content) { + return *expectedToOptional(CAS->storeFromString(std::nullopt, Content)); + }; + + auto createRoot = [&](StringRef Blob, StringRef Path, + std::optional &Root) { + HierarchicalTreeBuilder Builder; + Builder.push(make(Blob), TreeEntry::Regular, Path); + ASSERT_THAT_ERROR(Builder.create(*CAS).moveInto(Root), Succeeded()); + }; + + std::optional Root1; + createRoot("blob1", "/t1/d1", Root1); + std::optional Root2; + createRoot("blob2", "/t1/d1", Root2); + std::optional Root3; + createRoot("blob3", "/t1/d1/nested", Root3); + + { + HierarchicalTreeBuilder Builder; + Builder.pushTreeContent(Root1->getRef(), ""); + Builder.pushTreeContent(Root2->getRef(), ""); + std::optional Root; + EXPECT_THAT_ERROR( + Builder.create(*CAS).moveInto(Root), + FailedWithMessage("duplicate path '/t1/d1' with different ID")); + } + { + HierarchicalTreeBuilder Builder; + Builder.pushTreeContent(Root1->getRef(), ""); + Builder.pushTreeContent(Root3->getRef(), ""); + std::optional Root; + EXPECT_THAT_ERROR(Builder.create(*CAS).moveInto(Root), + FailedWithMessage("duplicate path '/t1/d1'")); + } +} diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp new file mode 100644 index 00000000000000..d6bcf20cc577a4 --- /dev/null +++ b/llvm/unittests/CAS/ObjectStoreTest.cpp @@ -0,0 +1,469 @@ +//===- ObjectStoreTest.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/ThreadPool.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +#include "CASTestConfig.h" + +using namespace llvm; +using namespace llvm::cas; + +TEST_P(CASTest, PrintIDs) { + std::shared_ptr CAS = createObjectStore(); + + std::optional ID1, ID2; + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1), + Succeeded()); + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2), + Succeeded()); + EXPECT_NE(ID1, ID2); + std::string PrintedID1 = ID1->toString(); + std::string PrintedID2 = ID2->toString(); + EXPECT_NE(PrintedID1, PrintedID2); + + std::optional ParsedID1, ParsedID2; + ASSERT_THAT_ERROR(CAS->parseID(PrintedID1).moveInto(ParsedID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->parseID(PrintedID2).moveInto(ParsedID2), Succeeded()); + EXPECT_EQ(ID1, ParsedID1); + EXPECT_EQ(ID2, ParsedID2); +} + +TEST_P(CASTest, Blobs) { + std::shared_ptr CAS1 = createObjectStore(); + StringRef ContentStrings[] = { + "word", + "some longer text std::string's local memory", + R"(multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text)", + }; + + SmallVector IDs; + for (StringRef Content : ContentStrings) { + // Use StringRef::str() to create a temporary std::string. 
This could cause + // problems if the CAS is storing references to the input string instead of + // copying it. + std::optional Blob; + ASSERT_THAT_ERROR(CAS1->createProxy(std::nullopt, Content).moveInto(Blob), + Succeeded()); + IDs.push_back(Blob->getID()); + + // Check basic printing of IDs. + EXPECT_EQ(IDs.back().toString(), IDs.back().toString()); + if (IDs.size() > 2) + EXPECT_NE(IDs.front().toString(), IDs.back().toString()); + } + + // Check that the blobs give the same IDs later. + for (int I = 0, E = IDs.size(); I != E; ++I) { + std::optional Blob; + ASSERT_THAT_ERROR( + CAS1->createProxy(std::nullopt, ContentStrings[I]).moveInto(Blob), + Succeeded()); + EXPECT_EQ(IDs[I], Blob->getID()); + } + + // Run validation on all CASIDs. + for (int I = 0, E = IDs.size(); I != E; ++I) + ASSERT_THAT_ERROR(CAS1->validate(IDs[I]), Succeeded()); + + // Check that the blobs can be retrieved multiple times. + for (int I = 0, E = IDs.size(); I != E; ++I) { + for (int J = 0, JE = 3; J != JE; ++J) { + std::optional Buffer; + ASSERT_THAT_ERROR(CAS1->getProxy(IDs[I]).moveInto(Buffer), Succeeded()); + EXPECT_EQ(ContentStrings[I], Buffer->getData()); + } + } + + // Confirm these blobs don't exist in a fresh CAS instance. + std::shared_ptr CAS2 = createObjectStore(); + for (int I = 0, E = IDs.size(); I != E; ++I) { + std::optional Proxy; + EXPECT_THAT_ERROR(CAS2->getProxy(IDs[I]).moveInto(Proxy), Failed()); + } + + // Insert into the second CAS and confirm the IDs are stable. Getting them + // should work now. 
+ for (int I = IDs.size(), E = 0; I != E; --I) { + auto &ID = IDs[I - 1]; + auto &Content = ContentStrings[I - 1]; + std::optional Blob; + ASSERT_THAT_ERROR(CAS2->createProxy(std::nullopt, Content).moveInto(Blob), + Succeeded()); + EXPECT_EQ(ID, Blob->getID()); + + std::optional Buffer; + ASSERT_THAT_ERROR(CAS2->getProxy(ID).moveInto(Buffer), Succeeded()); + EXPECT_EQ(Content, Buffer->getData()); + } +} + +TEST_P(CASTest, BlobsBig) { + // A little bit of validation that bigger blobs are okay. Climb up to 1MB. + std::shared_ptr CAS = createObjectStore(); + SmallString<256> String1 = StringRef("a few words"); + SmallString<256> String2 = StringRef("others"); + while (String1.size() < 1024U * 1024U) { + std::optional ID1; + std::optional ID2; + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String1).moveInto(ID1), + Succeeded()); + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String1).moveInto(ID2), + Succeeded()); + ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_EQ(ID1, ID2); + + String1.append(String2); + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String2).moveInto(ID1), + Succeeded()); + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, String2).moveInto(ID2), + Succeeded()); + ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_EQ(ID1, ID2); + String2.append(String1); + } + + // Specifically check near 1MB for objects large enough they're likely to be + // stored externally in an on-disk CAS and will be near a page boundary. 
+ SmallString<0> Storage; + const size_t InterestingSize = 1024U * 1024ULL; + const size_t SizeE = InterestingSize + 2; + if (Storage.size() < SizeE) + Storage.resize(SizeE, '\01'); + for (size_t Size = InterestingSize - 2; Size != SizeE; ++Size) { + StringRef Data(Storage.data(), Size); + std::optional Blob; + ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, Data).moveInto(Blob), + Succeeded()); + ASSERT_EQ(Data, Blob->getData()); + ASSERT_EQ(0, Blob->getData().end()[0]); + } +} + +TEST_P(CASTest, LeafNodes) { + std::shared_ptr CAS1 = createObjectStore(); + StringRef ContentStrings[] = { + "word", + "some longer text std::string's local memory", + R"(multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text +multiline text multiline text multiline text multiline text multiline text)", + }; + + SmallVector Nodes; + SmallVector IDs; + for (StringRef Content : ContentStrings) { + // Use StringRef::str() to create a temporary std::string. This could cause + // problems if the CAS is storing references to the input string instead of + // copying it. + std::optional Node; + ASSERT_THAT_ERROR( + CAS1->store(std::nullopt, arrayRefFromStringRef(Content)) + .moveInto(Node), + Succeeded()); + Nodes.push_back(*Node); + + // Check basic printing of IDs. + IDs.push_back(CAS1->getID(*Node)); + EXPECT_EQ(IDs.back().toString(), IDs.back().toString()); + EXPECT_EQ(Nodes.front(), Nodes.front()); + EXPECT_EQ(Nodes.back(), Nodes.back()); + EXPECT_EQ(IDs.front(), IDs.front()); + EXPECT_EQ(IDs.back(), IDs.back()); + if (Nodes.size() <= 1) + continue; + EXPECT_NE(Nodes.front(), Nodes.back()); + EXPECT_NE(IDs.front(), IDs.back()); + } + + // Check that the blobs give the same IDs later. 
+ for (int I = 0, E = IDs.size(); I != E; ++I) { + std::optional Node; + ASSERT_THAT_ERROR(CAS1->store(std::nullopt, arrayRefFromStringRef( + ContentStrings[I])) + .moveInto(Node), + Succeeded()); + EXPECT_EQ(IDs[I], CAS1->getID(*Node)); + } + + // Check that the blobs can be retrieved multiple times. + for (int I = 0, E = IDs.size(); I != E; ++I) { + for (int J = 0, JE = 3; J != JE; ++J) { + std::optional Object; + ASSERT_THAT_ERROR(CAS1->getProxy(IDs[I]).moveInto(Object), Succeeded()); + ASSERT_TRUE(Object); + EXPECT_EQ(ContentStrings[I], Object->getData()); + } + } + + // Confirm these blobs don't exist in a fresh CAS instance. + std::shared_ptr CAS2 = createObjectStore(); + for (int I = 0, E = IDs.size(); I != E; ++I) { + std::optional Object; + EXPECT_THAT_ERROR(CAS2->getProxy(IDs[I]).moveInto(Object), Failed()); + } + + // Insert into the second CAS and confirm the IDs are stable. Getting them + // should work now. + for (int I = IDs.size(), E = 0; I != E; --I) { + auto &ID = IDs[I - 1]; + auto &Content = ContentStrings[I - 1]; + std::optional Node; + ASSERT_THAT_ERROR( + CAS2->store(std::nullopt, arrayRefFromStringRef(Content)) + .moveInto(Node), + Succeeded()); + EXPECT_EQ(ID, CAS2->getID(*Node)); + + std::optional Object; + ASSERT_THAT_ERROR(CAS2->getProxy(ID).moveInto(Object), Succeeded()); + ASSERT_TRUE(Object); + EXPECT_EQ(Content, Object->getData()); + } +} + +TEST_P(CASTest, NodesBig) { + std::shared_ptr CAS = createObjectStore(); + + // Specifically check near 1MB for objects large enough they're likely to be + // stored externally in an on-disk CAS, and such that one of them will be + // near a page boundary. + SmallString<0> Storage; + constexpr size_t InterestingSize = 1024U * 1024ULL; + constexpr size_t WordSize = sizeof(void *); + + // Start much smaller to account for headers. 
+ constexpr size_t SizeB = InterestingSize - 8 * WordSize; + constexpr size_t SizeE = InterestingSize + 1; + if (Storage.size() < SizeE) + Storage.resize(SizeE, '\01'); + + SmallVector CreatedNodes; + // Avoid checking every size because this is an expensive test. Just check + // for data that is 8B-word-aligned, and one less. Also appending the created + // nodes as the references in the next block to check references are created + // correctly. + for (size_t Size = SizeB; Size < SizeE; Size += WordSize) { + for (bool IsAligned : {false, true}) { + StringRef Data(Storage.data(), Size - (IsAligned ? 0 : 1)); + std::optional Node; + ASSERT_THAT_ERROR(CAS->createProxy(CreatedNodes, Data).moveInto(Node), + Succeeded()); + ASSERT_EQ(Data, Node->getData()); + ASSERT_EQ(0, Node->getData().end()[0]); + ASSERT_EQ(Node->getNumReferences(), CreatedNodes.size()); + CreatedNodes.emplace_back(Node->getRef()); + } + } + + for (auto ID : CreatedNodes) + ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded()); +} + +/// Common test functionality for creating blobs in parallel. You can vary which +/// cas instances are the same or different, and the size of the created blobs. 
+static void testBlobsParallel(ObjectStore &Read1, ObjectStore &Read2,
+                              ObjectStore &Write1, ObjectStore &Write2,
+                              uint64_t BlobSize) {
+  // Pass the name as a string literal: a bare function name would decay to a
+  // pointer and the trace would not show the helper's name.
+  SCOPED_TRACE("testBlobsParallel");
+  unsigned BlobCount = 100;
+
+  // Fill BlobCount blobs of exactly BlobSize bytes with random data.
+  std::vector Blobs;
+  Blobs.reserve(BlobCount);
+  for (unsigned I = 0; I < BlobCount; ++I) {
+    std::string Blob;
+    Blob.reserve(BlobSize);
+    while (Blob.size() < BlobSize) {
+      auto R = sys::Process::GetRandomNumber();
+      Blob.append(reinterpret_cast<const char *>(&R), sizeof(R));
+    }
+    assert(Blob.size() >= BlobSize);
+    // The last append may overshoot; trim back to the requested size.
+    Blob.resize(BlobSize);
+    Blobs.push_back(std::move(Blob));
+  }
+
+  // CreatedNodes[I] stays empty until a producer has stored blob I. All
+  // accesses go through NodesMtx.
+  std::mutex NodesMtx;
+  std::vector> CreatedNodes(BlobCount);
+
+  // Store blob I in \p CAS and publish its ID. On failure the tombstone key is
+  // published instead so that consumers stop waiting.
+  auto Producer = [&](unsigned I, ObjectStore *CAS) {
+    std::optional Node;
+    EXPECT_THAT_ERROR(CAS->createProxy({}, Blobs[I]).moveInto(Node),
+                      Succeeded());
+    {
+      std::lock_guard L(NodesMtx);
+      CreatedNodes[I] = Node ? Node->getID() : CASID::getDenseMapTombstoneKey();
+    }
+  };
+
+  // Wait for the ID of blob I to be published, then load it from \p CAS and
+  // check that the data round-trips.
+  auto Consumer = [&](unsigned I, ObjectStore *CAS) {
+    std::optional ID;
+    while (!ID) {
+      // Busy wait.
+      std::lock_guard L(NodesMtx);
+      ID = CreatedNodes[I];
+    }
+    if (ID == CASID::getDenseMapTombstoneKey())
+      // Producer failed; already reported.
+      return;
+
+    std::optional Node;
+    ASSERT_THAT_ERROR(CAS->getProxy(*ID).moveInto(Node), Succeeded());
+    EXPECT_EQ(Node->getData(), Blobs[I]);
+  };
+
+  StdThreadPool Threads;
+  for (unsigned I = 0; I < BlobCount; ++I) {
+    // Queue the producers for blob I before its consumers. A consumer spins
+    // until some producer has published the ID, so a pool with very few
+    // workers must never be handed only consumers, or it would spin forever.
+    Threads.async(Producer, I, &Write1);
+    Threads.async(Producer, I, &Write2);
+    Threads.async(Consumer, I, &Read1);
+    Threads.async(Consumer, I, &Read2);
+  }
+
+  Threads.wait();
+}
+
+/// Run testBlobsParallel with \p CAS used for every reader and writer.
+static void testBlobsParallel1(ObjectStore &CAS, uint64_t BlobSize) {
+  SCOPED_TRACE("testBlobsParallel1");
+  testBlobsParallel(CAS, CAS, CAS, CAS, BlobSize);
+}
+
+TEST_P(CASTest, BlobsParallel) {
+  std::shared_ptr CAS = createObjectStore();
+  uint64_t Size = 1ULL * 1024;
+  ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size));
+}
+
+#ifdef EXPENSIVE_CHECKS
+TEST_P(CASTest, BlobsBigParallel) {
+  std::shared_ptr CAS = createObjectStore();
+  // 100k is large enough to be standalone files in our on-disk cas.
+  uint64_t Size = 100ULL * 1024;
+  ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size));
+}
+#endif
+
+#if LLVM_ENABLE_ONDISK_CAS
+TEST(OnDiskCASTest, BlobsParallelMultiCAS) {
+  // This test intentionally uses symlinked paths to the same CAS to subvert the
+  // shared memory mappings that would normally be created within a single
+  // process. This breaks the lock file guarantees, so we must be careful not
+  // to create or destroy the CAS objects concurrently, which is when the locks
+  // are normally important.
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")), + std::error_code()); + + std::unique_ptr CAS1, CAS2, CAS3, CAS4; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4), + Succeeded()); + + uint64_t Size = 1ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size)); +} +TEST(OnDiskCASTest, BlobsBigParallelMultiCAS) { + // See comment in BlobsParallelMultiCAS. + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")), + std::error_code()); + + std::unique_ptr CAS1, CAS2, CAS3, CAS4; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4), + Succeeded()); + + // 100k is large enough to be standalone files in our on-disk cas. 
+ uint64_t Size = 100ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size)); +} + +#ifndef _WIN32 // FIXME: resize support on Windows. +TEST(OnDiskCASTest, DiskSize) { + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + std::unique_ptr CAS; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + + uint64_t MaxSize = 100 * 1024 * 1024; + + // Check that we map the files to the correct size. + auto CheckFileSizes = [&](bool Mapped) { + bool FoundIndex = false, FoundData = false; + std::error_code EC; + for (sys::fs::directory_iterator I(Temp.path(), EC), E; I != E && !EC; + I.increment(EC)) { + if (StringRef(I->path()).ends_with(".index")) { + FoundIndex = true; + ASSERT_TRUE(I->status()); + if (Mapped) + EXPECT_EQ(I->status()->getSize(), MaxSize); + else + EXPECT_LT(I->status()->getSize(), MaxSize); + } + if (StringRef(I->path()).ends_with(".data")) { + FoundData = true; + ASSERT_TRUE(I->status()); + if (Mapped) + EXPECT_EQ(I->status()->getSize(), MaxSize); + else + EXPECT_LT(I->status()->getSize(), MaxSize); + } + } + ASSERT_TRUE(FoundIndex); + ASSERT_TRUE(FoundData); + }; + + // Check that we have the full mapping size when the CAS is open. + CheckFileSizes(/*Mapped=*/true); + CAS.reset(); + // Check that the CAS is shrunk to a smaller size. + CheckFileSizes(/*Mapped=*/false); + + // Repeat the checks when starting from an existing CAS. 
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + CheckFileSizes(/*Mapped=*/true); + CAS.reset(); + CheckFileSizes(/*Mapped=*/false); +} +#endif +#endif diff --git a/llvm/unittests/CAS/OnDiskCommonUtils.h b/llvm/unittests/CAS/OnDiskCommonUtils.h new file mode 100644 index 00000000000000..3978c6b054e8d3 --- /dev/null +++ b/llvm/unittests/CAS/OnDiskCommonUtils.h @@ -0,0 +1,69 @@ +//===- llvm/unittest/CAS/OnDiskCommonUtils.h --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/Support/BLAKE3.h" + +namespace llvm::unittest::cas { + +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +using HasherT = BLAKE3; +using HashType = decltype(HasherT::hash(std::declval &>())); +using ValueType = std::array; + +inline HashType digest(StringRef Data, ArrayRef> RefHashes) { + return BuiltinObjectHasher::hashObject( + RefHashes, arrayRefFromStringRef(Data)); +} + +inline ObjectID digest(OnDiskGraphDB &DB, StringRef Data, + ArrayRef Refs) { + SmallVector, 8> RefHashes; + for (ObjectID Ref : Refs) + RefHashes.push_back(DB.getDigest(Ref)); + HashType Digest = digest(Data, RefHashes); + return DB.getReference(Digest); +} + +inline HashType digest(StringRef Data) { + return HasherT::hash(arrayRefFromStringRef(Data)); +} + +inline ValueType valueFromString(StringRef S) { + ValueType Val; + llvm::copy(S.substr(0, sizeof(Val)), Val.data()); + return Val; +} + +inline Expected store(OnDiskGraphDB &DB, StringRef Data, + ArrayRef Refs) { + ObjectID ID = digest(DB, Data, Refs); + if (Error E = DB.store(ID, Refs, arrayRefFromStringRef(Data))) + return std::move(E); + return ID; +} + 
+inline Error printTree(OnDiskGraphDB &DB, ObjectID ID, raw_ostream &OS, + unsigned Indent = 0) { + std::optional Obj; + if (Error E = DB.load(ID).moveInto(Obj)) + return E; + if (!Obj) + return Error::success(); + OS.indent(Indent) << toStringRef(DB.getObjectData(*Obj)) << '\n'; + for (ObjectID Ref : DB.getObjectRefs(*Obj)) { + if (Error E = printTree(DB, Ref, OS, Indent + 2)) + return E; + } + return Error::success(); +} + +} // namespace llvm::unittest::cas diff --git a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp new file mode 100644 index 00000000000000..57ea061e4f43a7 --- /dev/null +++ b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp @@ -0,0 +1,284 @@ +//===- llvm/unittest/CAS/OnDiskGraphDBTest.cpp ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "OnDiskCommonUtils.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +#if LLVM_ENABLE_ONDISK_CAS + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; +using namespace llvm::unittest::cas; + +TEST(OnDiskGraphDBTest, Basic) { + unittest::TempDir Temp("ondiskcas", /*Unique=*/true); + std::unique_ptr DB; + ASSERT_THAT_ERROR( + OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType)).moveInto(DB), + Succeeded()); + + auto digest = [&DB](StringRef Data, ArrayRef Refs) -> ObjectID { + return ::digest(*DB, Data, Refs); + }; + + auto store = [&](StringRef Data, + ArrayRef Refs) -> Expected { + return ::store(*DB, Data, Refs); + }; + + std::optional ID1; + ASSERT_THAT_ERROR(store("hello", {}).moveInto(ID1), Succeeded()); + + std::optional Obj1; + ASSERT_THAT_ERROR(DB->load(*ID1).moveInto(Obj1), 
Succeeded()); + ASSERT_TRUE(Obj1.has_value()); + EXPECT_EQ(toStringRef(DB->getObjectData(*Obj1)), "hello"); + + ArrayRef Digest1 = DB->getDigest(*ID1); + ObjectID ID2 = DB->getReference(Digest1); + EXPECT_EQ(ID1, ID2); + + ObjectID ID3 = digest("world", {}); + EXPECT_FALSE(DB->containsObject(ID3)); + std::optional Obj2; + ASSERT_THAT_ERROR(DB->load(ID3).moveInto(Obj2), Succeeded()); + EXPECT_FALSE(Obj2.has_value()); + + ASSERT_THAT_ERROR(DB->store(ID3, {}, arrayRefFromStringRef("world")), + Succeeded()); + EXPECT_TRUE(DB->containsObject(ID3)); + ASSERT_THAT_ERROR(DB->load(ID3).moveInto(Obj2), Succeeded()); + ASSERT_TRUE(Obj2.has_value()); + EXPECT_EQ(toStringRef(DB->getObjectData(*Obj2)), "world"); + + size_t LargeDataSize = 256LL * 1024LL; // 256K. + // The precise size number is not important, we mainly check that the large + // object will be properly accounted for. + EXPECT_TRUE(DB->getStorageSize() > 10 && + DB->getStorageSize() < LargeDataSize); + + SmallString<16> Buffer; + Buffer.resize(LargeDataSize); + ASSERT_THAT_ERROR(store(Buffer, {}).moveInto(ID1), Succeeded()); + size_t StorageSize = DB->getStorageSize(); + EXPECT_TRUE(StorageSize > LargeDataSize); + + // Close & re-open the DB and check that it reports the same storage size. 
+ DB.reset(); + ASSERT_THAT_ERROR( + OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType)).moveInto(DB), + Succeeded()); + EXPECT_EQ(DB->getStorageSize(), StorageSize); +} + +TEST(OnDiskGraphDBTest, FaultInSingleNode) { + unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true); + std::unique_ptr UpstreamDB; + ASSERT_THAT_ERROR( + OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType)) + .moveInto(UpstreamDB), + Succeeded()); + { + std::optional ID1; + ASSERT_THAT_ERROR(store(*UpstreamDB, "hello", {}).moveInto(ID1), + Succeeded()); + std::optional ID2; + ASSERT_THAT_ERROR(store(*UpstreamDB, "another", {}).moveInto(ID2), + Succeeded()); + std::optional ID3; + ASSERT_THAT_ERROR(store(*UpstreamDB, "world", {*ID1, *ID2}).moveInto(ID3), + Succeeded()); + } + + unittest::TempDir Temp("ondiskcas", /*Unique=*/true); + std::unique_ptr DB; + ASSERT_THAT_ERROR( + OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType), + std::move(UpstreamDB), + OnDiskGraphDB::FaultInPolicy::SingleNode) + .moveInto(DB), + Succeeded()); + + ObjectID ID1 = digest(*DB, "hello", {}); + ObjectID ID2 = digest(*DB, "another", {}); + ObjectID ID3 = digest(*DB, "world", {ID1, ID2}); + ObjectID ID4 = digest(*DB, "world", {}); + + EXPECT_TRUE(DB->containsObject(ID1)); + EXPECT_TRUE(DB->containsObject(ID2)); + EXPECT_TRUE(DB->containsObject(ID3)); + EXPECT_FALSE(DB->containsObject(ID4)); + + EXPECT_TRUE(DB->getExistingReference(digest("hello", {})).has_value()); + EXPECT_TRUE(DB->getExistingReference(DB->getDigest(ID3)).has_value()); + EXPECT_FALSE(DB->getExistingReference(digest("world", {})).has_value()); + + { + std::optional Obj; + ASSERT_THAT_ERROR(DB->load(ID1).moveInto(Obj), Succeeded()); + ASSERT_TRUE(Obj.has_value()); + EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "hello"); + auto Refs = DB->getObjectRefs(*Obj); + EXPECT_TRUE(Refs.empty()); + } + { + std::optional Obj; + ASSERT_THAT_ERROR(DB->load(ID3).moveInto(Obj), Succeeded()); + 
ASSERT_TRUE(Obj.has_value()); + EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "world"); + auto Refs = DB->getObjectRefs(*Obj); + ASSERT_EQ(std::distance(Refs.begin(), Refs.end()), 2); + EXPECT_EQ(Refs.begin()[0], ID1); + EXPECT_EQ(Refs.begin()[1], ID2); + } + { + std::optional Obj; + ASSERT_THAT_ERROR(DB->load(ID4).moveInto(Obj), Succeeded()); + EXPECT_FALSE(Obj.has_value()); + } + + // Re-open the primary without chaining, to verify the data were copied from + // the upstream. + ASSERT_THAT_ERROR( + OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType), + /*UpstreamDB=*/nullptr, + OnDiskGraphDB::FaultInPolicy::SingleNode) + .moveInto(DB), + Succeeded()); + ID1 = digest(*DB, "hello", {}); + ID2 = digest(*DB, "another", {}); + ID3 = digest(*DB, "world", {ID1, ID2}); + EXPECT_TRUE(DB->containsObject(ID1)); + EXPECT_FALSE(DB->containsObject(ID2)); + EXPECT_TRUE(DB->containsObject(ID3)); + { + std::optional Obj; + ASSERT_THAT_ERROR(DB->load(ID1).moveInto(Obj), Succeeded()); + ASSERT_TRUE(Obj.has_value()); + EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "hello"); + auto Refs = DB->getObjectRefs(*Obj); + EXPECT_TRUE(Refs.empty()); + } +} + +TEST(OnDiskGraphDBTest, FaultInFullTree) { + unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true); + std::unique_ptr UpstreamDB; + ASSERT_THAT_ERROR( + OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType)) + .moveInto(UpstreamDB), + Succeeded()); + HashType RootHash; + { + std::optional ID11; + ASSERT_THAT_ERROR(store(*UpstreamDB, "11", {}).moveInto(ID11), Succeeded()); + std::optional ID121; + ASSERT_THAT_ERROR(store(*UpstreamDB, "121", {}).moveInto(ID121), + Succeeded()); + std::optional ID12; + ASSERT_THAT_ERROR(store(*UpstreamDB, "12", {*ID121}).moveInto(ID12), + Succeeded()); + std::optional ID1; + ASSERT_THAT_ERROR(store(*UpstreamDB, "1", {*ID11, *ID12}).moveInto(ID1), + Succeeded()); + std::optional ID21; + ASSERT_THAT_ERROR(store(*UpstreamDB, "21", {}).moveInto(ID21), Succeeded()); + 
std::optional ID22; + ASSERT_THAT_ERROR(store(*UpstreamDB, "22", {}).moveInto(ID22), Succeeded()); + std::optional ID2; + ASSERT_THAT_ERROR( + store(*UpstreamDB, "2", {*ID12, *ID21, *ID22}).moveInto(ID2), + Succeeded()); + std::optional IDRoot; + ASSERT_THAT_ERROR(store(*UpstreamDB, "root", {*ID1, *ID2}).moveInto(IDRoot), + Succeeded()); + ArrayRef Digest = UpstreamDB->getDigest(*IDRoot); + ASSERT_EQ(Digest.size(), RootHash.size()); + llvm::copy(Digest, RootHash.data()); + } + + unittest::TempDir Temp("ondiskcas", /*Unique=*/true); + std::unique_ptr DB; + ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType), + std::move(UpstreamDB), + OnDiskGraphDB::FaultInPolicy::FullTree) + .moveInto(DB), + Succeeded()); + + { + ObjectID IDRoot = DB->getReference(RootHash); + std::optional Obj; + ASSERT_THAT_ERROR(DB->load(IDRoot).moveInto(Obj), Succeeded()); + ASSERT_TRUE(Obj.has_value()); + EXPECT_EQ(toStringRef(DB->getObjectData(*Obj)), "root"); + auto Refs = DB->getObjectRefs(*Obj); + ASSERT_EQ(std::distance(Refs.begin(), Refs.end()), 2); + } + + // Re-open the primary without chaining, to verify the data were copied from + // the upstream. 
+ ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType), + /*UpstreamDB=*/nullptr, + OnDiskGraphDB::FaultInPolicy::FullTree) + .moveInto(DB), + Succeeded()); + + ObjectID IDRoot = DB->getReference(RootHash); + std::string PrintedTree; + raw_string_ostream OS(PrintedTree); + ASSERT_THAT_ERROR(printTree(*DB, IDRoot, OS), Succeeded()); + StringRef Expected = R"(root + 1 + 11 + 12 + 121 + 2 + 12 + 121 + 21 + 22 +)"; + EXPECT_EQ(PrintedTree, Expected); +} + +TEST(OnDiskGraphDBTest, FaultInPolicyConflict) { + auto tryFaultInPolicyConflict = [](OnDiskGraphDB::FaultInPolicy Policy1, + OnDiskGraphDB::FaultInPolicy Policy2) { + unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true); + std::unique_ptr UpstreamDB; + ASSERT_THAT_ERROR( + OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType)) + .moveInto(UpstreamDB), + Succeeded()); + + unittest::TempDir Temp("ondiskcas", /*Unique=*/true); + std::unique_ptr DB; + ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", + sizeof(HashType), + std::move(UpstreamDB), Policy1) + .moveInto(DB), + Succeeded()); + DB.reset(); + ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", + sizeof(HashType), + std::move(UpstreamDB), Policy2) + .moveInto(DB), + Failed()); + }; + // Open as 'single', then as 'full'. + tryFaultInPolicyConflict(OnDiskGraphDB::FaultInPolicy::SingleNode, + OnDiskGraphDB::FaultInPolicy::FullTree); + // Open as 'full', then as 'single'. 
+ tryFaultInPolicyConflict(OnDiskGraphDB::FaultInPolicy::FullTree, + OnDiskGraphDB::FaultInPolicy::SingleNode); +} + +#endif // LLVM_ENABLE_ONDISK_CAS diff --git a/llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp b/llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp new file mode 100644 index 00000000000000..b8e5bf632dde25 --- /dev/null +++ b/llvm/unittests/CAS/OnDiskHashMappedTrieTest.cpp @@ -0,0 +1,146 @@ +//===- OnDiskHashMappedTrieTest.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/OnDiskHashMappedTrie.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +#if LLVM_ENABLE_ONDISK_CAS +using namespace llvm; +using namespace llvm::cas; + +namespace { + +TEST(OnDiskHashMappedTrieTest, Insertion) { + unittest::TempDir Temp("on-disk-hash-mapped-trie", /*Unique=*/true); + + // Create tries with various sizes of hash and with data. + // + // NOTE: The check related to \a recoverFromFileOffset() catches a potential + // off-by-one bounds-checking bug when the trie record size (data + hash) add + // up to a multiple of 8B. Iterate through a few different hash sizes to + // check it both ways. + constexpr size_t MB = 1024u * 1024u; + constexpr size_t DataSize = 8; // Multiple of 8B. 
+ for (size_t NumHashBytes : {1, 2, 4, 8}) { + size_t NumHashBits = NumHashBytes * 8; + + auto createTrie = [&]() { + return OnDiskHashMappedTrie::create( + Temp.path((Twine(NumHashBytes) + "B").str()), "index", + /*NumHashBits=*/NumHashBits, DataSize, /*MaxFileSize=*/MB, + /*NewInitialFileSize=*/std::nullopt); + }; + + std::optional Trie1; + ASSERT_THAT_ERROR(createTrie().moveInto(Trie1), Succeeded()); + std::optional Trie2; + ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded()); + + uint8_t Hash0Bytes[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + uint8_t Hash1Bytes[8] = {1, 0, 0, 0, 0, 0, 0, 0}; + auto Hash0 = ArrayRef(Hash0Bytes).take_front(NumHashBytes); + auto Hash1 = ArrayRef(Hash1Bytes).take_front(NumHashBytes); + constexpr StringLiteral Data0v1Bytes = "data0.v1"; + constexpr StringLiteral Data0v2Bytes = "data0.v2"; + constexpr StringLiteral Data1Bytes = "data1..."; + static_assert(Data0v1Bytes.size() == DataSize, "math error"); + static_assert(Data0v2Bytes.size() == DataSize, "math error"); + static_assert(Data1Bytes.size() == DataSize, "math error"); + ArrayRef Data0v1 = ArrayRef(Data0v1Bytes.data(), Data0v1Bytes.size()); + ArrayRef Data0v2 = ArrayRef(Data0v2Bytes.data(), Data0v2Bytes.size()); + ArrayRef Data1 = ArrayRef(Data1Bytes.data(), Data1Bytes.size()); + + // Lookup when trie is empty. + EXPECT_FALSE(Trie1->find(Hash0)); + + // Insert. + std::optional Offset; + std::optional> Data; + { + auto Insertion = Trie1->insert({Hash0, Data0v1}); + ASSERT_TRUE(Insertion); + EXPECT_EQ(Hash0, Insertion->Hash); + EXPECT_EQ(Data0v1, Insertion->Data); + EXPECT_TRUE(isAddrAligned(Align(8), Insertion->Data.data())); + + Offset = Insertion.getOffset(); + Data = Insertion->Data; + } + + // Find. 
+ { + auto Lookup = Trie1->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v1, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Find in a different instance of the same on-disk trie that existed + // before the insertion. + { + auto Lookup = Trie2->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v1, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Create a new instance and check that too. + Trie2.reset(); + ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded()); + { + auto Lookup = Trie2->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v1, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Change the data. + llvm::copy(Data0v2, Data->data()); + { + auto Lookup = Trie2->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v2, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Find different hash. + EXPECT_FALSE(Trie1->find(Hash1)); + EXPECT_FALSE(Trie2->find(Hash1)); + + // Recover from an offset. + { + auto Recovered = Trie1->recoverFromFileOffset(*Offset); + ASSERT_TRUE(Recovered); + EXPECT_EQ(Offset->get(), Recovered.getOffset().get()); + EXPECT_EQ(Hash0, Recovered->Hash); + EXPECT_EQ(Data0v2, Recovered->Data); + } + + // Insert another thing. 
+    {
+      auto Insertion = Trie1->insert({Hash1, Data1});
+      ASSERT_TRUE(Insertion);
+      EXPECT_EQ(Hash1, Insertion->Hash);
+      EXPECT_EQ(Data1, Insertion->Data);
+      EXPECT_TRUE(isAddrAligned(Align(8), Insertion->Data.data()));
+      EXPECT_NE(Offset->get(), Insertion.getOffset().get());
+    }
+  }
+}
+
+} // namespace
+
+#endif
diff --git a/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp b/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp
new file mode 100644
index 00000000000000..3edc5e77f64fb6
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskKeyValueDBTest.cpp
@@ -0,0 +1,54 @@
+//===- llvm/unittest/CAS/OnDiskKeyValueDBTest.cpp -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "OnDiskCommonUtils.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+using namespace llvm::unittest::cas;
+
+// Exercises get/put on a key-value database opened in a fresh temporary
+// directory:
+//  * a key that was never stored reads back as an empty optional;
+//  * the first put for a key returns the value that was passed in;
+//  * a second put for the same key with a different value still returns the
+//    first value;
+//  * a subsequent get returns that first value.
+TEST(OnDiskKeyValueDBTest, Basic) {
+  unittest::TempDir Temp("ondiskkv", /*Unique=*/true);
+  std::unique_ptr DB;
+  ASSERT_THAT_ERROR(OnDiskKeyValueDB::open(Temp.path(), "blake3",
+                                           sizeof(HashType), "test",
+                                           sizeof(ValueType))
+                        .moveInto(DB),
+                    Succeeded());
+
+  // Nothing has been stored yet, so the lookup succeeds but yields no value.
+  {
+    std::optional> Val;
+    ASSERT_THAT_ERROR(DB->get(digest("hello")).moveInto(Val), Succeeded());
+    EXPECT_FALSE(Val.has_value());
+  }
+
+  ValueType ValW = valueFromString("world");
+  ArrayRef Val;
+  ASSERT_THAT_ERROR(DB->put(digest("hello"), ValW).moveInto(Val), Succeeded());
+  EXPECT_EQ(Val, ArrayRef(ValW));
+  // Storing a different value under the same key must hand back the value
+  // that is already there.
+  ASSERT_THAT_ERROR(
+      DB->put(digest("hello"), valueFromString("other")).moveInto(Val),
+      Succeeded());
+  EXPECT_EQ(Val, ArrayRef(ValW));
+
+  {
+    std::optional> Val;
+    ASSERT_THAT_ERROR(DB->get(digest("hello")).moveInto(Val), Succeeded());
+    // Fatal on purpose: the dereference below is only valid when the lookup
+    // actually produced a value.
+    ASSERT_TRUE(Val.has_value());
+    EXPECT_EQ(*Val, ArrayRef(ValW));
+  }
+}
+
+#endif // LLVM_ENABLE_ONDISK_CAS
diff --git a/llvm/unittests/CAS/PluginCASTest.cpp b/llvm/unittests/CAS/PluginCASTest.cpp
new file mode 100644
index 00000000000000..e7bf025bf1794b
--- /dev/null
+++ b/llvm/unittests/CAS/PluginCASTest.cpp
@@ -0,0 +1,93 @@
+//===- llvm/unittest/CAS/PluginCASTest.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/PluginCAS.h"
+#include "CASTestConfig.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::unittest::cas;
+
+// Checks ObjectStore::isMaterialized through the plugin CAS in two phases
+// that share the same "upstream-path" option but get a fresh DownDir:
+//  1. Objects "1" and "2" are created, "2" is reported as materialized, and
+//     the association ID1 -> ID2 is stored in the action cache with
+//     Globally=true.
+//  2. After DownDir has been removed, the databases are created again. The
+//     action cache lookup for ID1 (Globally=true) must still produce ID2,
+//     which is reported as not materialized until getProxy() has been called
+//     on it, and as materialized afterwards.
+TEST(PluginCASTest, isMaterialized) {
+  unittest::TempDir Temp("plugin-cas", /*Unique=*/true);
+  std::string UpDir(Temp.path("up"));
+  std::string DownDir(Temp.path("down"));
+  std::pair PluginOpts[] = {
+      {"upstream-path", std::string(UpDir)}};
+
+  {
+    std::optional<
+        std::pair, std::shared_ptr>>
+        DBs;
+    ASSERT_THAT_ERROR(
+        createPluginCASDatabases(getCASPluginPath(), DownDir, PluginOpts)
+            .moveInto(DBs),
+        Succeeded());
+    std::shared_ptr CAS;
+    std::shared_ptr AC;
+    std::tie(CAS, AC) = std::move(*DBs);
+
+    std::optional ID1, ID2;
+    ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
+                      Succeeded());
+    ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "2").moveInto(ID2),
+                      Succeeded());
+    std::optional ID2Ref = CAS->getReference(*ID2);
+    ASSERT_TRUE(ID2Ref);
+    bool IsMaterialized = false;
+    ASSERT_THAT_ERROR(CAS->isMaterialized(*ID2Ref).moveInto(IsMaterialized),
+                      Succeeded());
+    EXPECT_TRUE(IsMaterialized);
+    ASSERT_THAT_ERROR(AC->put(*ID1, *ID2, /*Globally=*/true), Succeeded());
+  }
+
+  // Clear "local" cache.
+  sys::fs::remove_directories(DownDir);
+
+  {
+    std::optional<
+        std::pair, std::shared_ptr>>
+        DBs;
+    ASSERT_THAT_ERROR(
+        createPluginCASDatabases(getCASPluginPath(), DownDir, PluginOpts)
+            .moveInto(DBs),
+        Succeeded());
+    std::shared_ptr CAS;
+    std::shared_ptr AC;
+    std::tie(CAS, AC) = std::move(*DBs);
+
+    std::optional ID1, ID2;
+    ASSERT_THAT_ERROR(CAS->createProxy(std::nullopt, "1").moveInto(ID1),
+                      Succeeded());
+    ASSERT_THAT_ERROR(AC->get(*ID1, /*Globally=*/true).moveInto(ID2),
+                      Succeeded());
+    // A successful lookup may still leave ID2 empty; stop here instead of
+    // dereferencing an empty optional below.
+    ASSERT_TRUE(ID2);
+    std::optional ID2Ref = CAS->getReference(*ID2);
+    ASSERT_TRUE(ID2Ref);
+    bool IsMaterialized = false;
+    ASSERT_THAT_ERROR(CAS->isMaterialized(*ID2Ref).moveInto(IsMaterialized),
+                      Succeeded());
+    EXPECT_FALSE(IsMaterialized);
+
+    // Loading the object is what is expected to materialize it.
+    std::optional Obj;
+    ASSERT_THAT_ERROR(CAS->getProxy(*ID2Ref).moveInto(Obj), Succeeded());
+    ASSERT_THAT_ERROR(CAS->isMaterialized(*ID2Ref).moveInto(IsMaterialized),
+                      Succeeded());
+    EXPECT_TRUE(IsMaterialized);
+  }
+}
+
+#endif // LLVM_ENABLE_ONDISK_CAS
diff --git a/llvm/unittests/CAS/TreeSchemaTest.cpp b/llvm/unittests/CAS/TreeSchemaTest.cpp
new file mode 100644
index 00000000000000..1ec27526153939
--- /dev/null
+++ b/llvm/unittests/CAS/TreeSchemaTest.cpp
@@ -0,0 +1,266 @@
+//===- TreeSchemaTest.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/TreeSchema.h"
+#include "llvm/CAS/HierarchicalTreeBuilder.h"
+#include "llvm/CAS/ObjectStore.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+// Builds flat and nested trees in one in-memory CAS, re-creates them in a
+// second in-memory CAS, and checks that both produce the same IDs and hand
+// back the same (sorted) entries.
+TEST(TreeSchemaTest, Trees) {
+  std::unique_ptr CAS1 = createInMemoryCAS();
+  std::unique_ptr CAS2 = createInMemoryCAS();
+
+  // Stores Content in both CAS instances, expects matching IDs, and returns
+  // the reference that belongs to CAS1.
+  auto createBlobInBoth = [&](StringRef Content) {
+    std::optional H1, H2;
+    EXPECT_THAT_ERROR(CAS1->storeFromString(std::nullopt, Content).moveInto(H1),
+                      Succeeded());
+    EXPECT_THAT_ERROR(CAS2->storeFromString(std::nullopt, Content).moveInto(H2),
+                      Succeeded());
+    EXPECT_EQ(CAS1->getID(*H1), CAS2->getID(*H2));
+    return *H1;
+  };
+
+  ObjectRef Blob1 = createBlobInBoth("blob1");
+  ObjectRef Blob2 = createBlobInBoth("blob2");
+  ObjectRef Blob3 = createBlobInBoth("blob3");
+
+  // One empty tree, one tree per entry kind, and one tree with many names.
+  SmallVector, 0> FlatTreeEntries = {
+      {},
+      {NamedTreeEntry(Blob1, TreeEntry::Regular, "regular")},
+      {NamedTreeEntry(Blob2, TreeEntry::Executable, "executable")},
+      {NamedTreeEntry(Blob3, TreeEntry::Symlink, "symlink")},
+      {
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "various"),
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "names"),
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "that"),
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "do"),
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "not"),
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "conflict"),
+          NamedTreeEntry(Blob1, TreeEntry::Regular, "but have spaces and..."),
+          NamedTreeEntry(Blob1, TreeEntry::Regular,
+                         "`~,!@#$%^&*()-+=[]{}\\<>'\""),
+      },
+  };
+
+  SmallVector FlatRefs;
+  SmallVector FlatIDs;
+  TreeSchema Schema1(*CAS1);
+
+  for (ArrayRef Entries : FlatTreeEntries) {
+    std::optional H;
+    ASSERT_THAT_ERROR(Schema1.create(Entries).moveInto(H), Succeeded());
+    FlatIDs.push_back(H->getID());
+    FlatRefs.push_back(H->getRef());
+  }
+
+  // Confirm we get the same IDs the second time and that the trees can be
+  // visited (the entries themselves will be checked later).
+  for (int I = 0, E = FlatIDs.size(); I != E; ++I) {
+    std::optional H;
+    ASSERT_THAT_ERROR(Schema1.create(FlatTreeEntries[I]).moveInto(H),
+                      Succeeded());
+    EXPECT_EQ(FlatRefs[I], CAS1->getReference(*H));
+    std::optional Tree;
+    ASSERT_THAT_ERROR(TreeProxy::get(Schema1, *H).moveInto(Tree), Succeeded());
+    EXPECT_EQ(FlatTreeEntries[I].size(), Tree->size());
+
+    // Only the number of callbacks matters here, so the entry is left unnamed
+    // (this also avoids shadowing the loop bound E).
+    size_t NumCalls = 0;
+    EXPECT_THAT_ERROR(Tree->forEachEntry([&NumCalls](const NamedTreeEntry &) {
+      ++NumCalls;
+      return Error::success();
+    }),
+                      Succeeded());
+    EXPECT_EQ(FlatTreeEntries[I].size(), NumCalls);
+  }
+
+  // Run validation.
+  for (int I = 1, E = FlatIDs.size(); I != E; ++I)
+    ASSERT_THAT_ERROR(CAS1->validate(FlatIDs[I]), Succeeded());
+
+  // Confirm these trees don't exist in a fresh CAS instance. Skip the first
+  // tree, which is empty and could be implicitly in some CAS.
+  for (int I = 1, E = FlatIDs.size(); I != E; ++I)
+    EXPECT_FALSE(CAS2->getReference(FlatIDs[I]));
+
+  // Insert into the other CAS and confirm the IDs are stable.
+  for (int I = FlatIDs.size(), E = 0; I != E; --I) {
+    for (ObjectStore *CAS : {&*CAS1, &*CAS2}) {
+      TreeSchema Schema(*CAS);
+      auto &ID = FlatIDs[I - 1];
+      // Make a copy of the original entries and sort them.
+      SmallVector NewEntries;
+      for (const NamedTreeEntry &Entry : FlatTreeEntries[I - 1]) {
+        std::optional NewRef =
+            CAS->getReference(CAS1->getID(Entry.getRef()));
+        ASSERT_TRUE(NewRef);
+        NewEntries.emplace_back(*NewRef, Entry.getKind(), Entry.getName());
+      }
+      llvm::sort(NewEntries);
+
+      // Confirm we get the same tree out of CAS2.
+      {
+        std::optional Tree;
+        ASSERT_THAT_ERROR(Schema.create(NewEntries).moveInto(Tree),
+                          Succeeded());
+        EXPECT_EQ(ID, Tree->getID());
+      }
+
+      // Check that the correct entries come back.
+      std::optional Ref = CAS->getReference(ID);
+      ASSERT_TRUE(Ref);
+      std::optional Tree;
+      ASSERT_THAT_ERROR(Schema.load(*Ref).moveInto(Tree), Succeeded());
+      // Separate index names so the enclosing loop's I and E stay visible.
+      for (int J = 0, JE = NewEntries.size(); J != JE; ++J)
+        EXPECT_EQ(NewEntries[J], Tree->get(J));
+    }
+  }
+
+  // Create some nested trees.
+  SmallVector NestedTrees = FlatRefs;
+  for (int I = 0, E = FlatTreeEntries.size() * 3; I != E; ++I) {
+    // Copy one of the flat entries and add some trees.
+    auto OriginalEntries =
+        ArrayRef(FlatTreeEntries[I % FlatTreeEntries.size()]);
+    SmallVector Entries(OriginalEntries.begin(),
+                        OriginalEntries.end());
+    std::string Name = ("tree" + Twine(I)).str();
+    Entries.emplace_back(*CAS1->getReference(FlatIDs[(I + 4) % FlatIDs.size()]),
+                         TreeEntry::Tree, Name);
+
+    std::optional Name1, Name2;
+    if (NestedTrees.size() >= 2) {
+      int Nested1 = I % NestedTrees.size();
+      int Nested2 = (I * 3 + 2) % NestedTrees.size();
+      if (Nested2 == Nested1)
+        Nested2 = (Nested1 + 1) % NestedTrees.size();
+      ASSERT_NE(Nested1, Nested2);
+      Name1.emplace(("tree" + Twine(I) + "-" + Twine(Nested1)).str());
+      Name2.emplace(("tree" + Twine(I) + "-" + Twine(Nested2)).str());
+
+      // Index with the de-duplicated values so every entry references the
+      // tree whose index is encoded in its name. Recomputing the raw
+      // expressions here would pick the same tree twice whenever the two
+      // indices collided above.
+      Entries.emplace_back(NestedTrees[Nested1], TreeEntry::Tree, *Name1);
+      Entries.emplace_back(NestedTrees[Nested2], TreeEntry::Tree, *Name2);
+    }
+    std::optional ID;
+    {
+      std::optional Tree;
+      ASSERT_THAT_ERROR(Schema1.create(Entries).moveInto(Tree), Succeeded());
+      ID = Tree->getID();
+    }
+
+    llvm::sort(Entries);
+    for (ObjectStore *CAS : {&*CAS1, &*CAS2}) {
+      // Make a copy of the original entries and sort them.
+      SmallVector NewEntries;
+      for (const NamedTreeEntry &Entry : Entries) {
+        std::optional NewRef =
+            CAS->getReference(CAS1->getID(Entry.getRef()));
+        ASSERT_TRUE(NewRef);
+        NewEntries.emplace_back(*NewRef, Entry.getKind(), Entry.getName());
+      }
+      llvm::sort(NewEntries);
+
+      TreeSchema Schema(*CAS);
+      std::optional Tree;
+      ASSERT_THAT_ERROR(Schema.create(NewEntries).moveInto(Tree), Succeeded());
+      ASSERT_EQ(*ID, Tree->getID());
+      ASSERT_THAT_ERROR(CAS->validate(*ID), Succeeded());
+      Tree.reset();
+      std::optional Ref = CAS->getReference(*ID);
+      ASSERT_TRUE(Ref);
+      ASSERT_THAT_ERROR(Schema.load(*Ref).moveInto(Tree), Succeeded());
+      // Separate index names so the enclosing loop's I and E stay visible.
+      for (int J = 0, JE = NewEntries.size(); J != JE; ++J)
+        EXPECT_EQ(NewEntries[J], Tree->get(J));
+    }
+  }
+}
+
+// Creates a tree from seven entries of which two share the name "f", expects
+// the resulting tree to report six entries, and looks every name up. A name
+// that was never inserted ("h") must not be found.
+TEST(TreeSchemaTest, Lookup) {
+  std::unique_ptr CAS = createInMemoryCAS();
+  std::optional Node;
+  // Fatal on purpose: Node is dereferenced on the next line.
+  ASSERT_THAT_ERROR(CAS->storeFromString(std::nullopt, "blob").moveInto(Node),
+                    Succeeded());
+  ObjectRef Blob = *Node;
+  SmallVector FlatTreeEntries = {
+      NamedTreeEntry(Blob, TreeEntry::Regular, "e"),
+      NamedTreeEntry(Blob, TreeEntry::Regular, "b"),
+      NamedTreeEntry(Blob, TreeEntry::Regular, "f"),
+      NamedTreeEntry(Blob, TreeEntry::Regular, "a"),
+      NamedTreeEntry(Blob, TreeEntry::Regular, "c"),
+      NamedTreeEntry(Blob, TreeEntry::Regular, "f"),
+      NamedTreeEntry(Blob, TreeEntry::Regular, "d"),
+  };
+  std::optional Tree;
+  TreeSchema Schema(*CAS);
+  ASSERT_THAT_ERROR(Schema.create(FlatTreeEntries).moveInto(Tree), Succeeded());
+  ASSERT_EQ(Tree->size(), static_cast<size_t>(6));
+  auto CheckEntry = [&](StringRef Name) {
+    auto MaybeEntry = Tree->lookup(Name);
+    ASSERT_TRUE(MaybeEntry);
+    ASSERT_EQ(MaybeEntry->getName(), Name);
+  };
+  CheckEntry("a");
+  CheckEntry("b");
+  CheckEntry("c");
+  CheckEntry("d");
+  CheckEntry("e");
+  CheckEntry("f");
+  ASSERT_FALSE(Tree->lookup("h"));
+}
+
+// Builds a small hierarchy with HierarchicalTreeBuilder and checks that
+// walkFileTreeRecursively reports exactly the expected entries, in the
+// expected order, with a tree object supplied only for directory entries.
+TEST(TreeSchemaTest, walkFileTreeRecursively) {
+  std::unique_ptr CAS = createInMemoryCAS();
+
+  auto make = [&](StringRef Content) {
+    return cantFail(CAS->storeFromString(std::nullopt, Content));
+  };
+
+  HierarchicalTreeBuilder Builder;
+  Builder.push(make("blob2"), TreeEntry::Regular, "/d2");
+  Builder.push(make("blob1"), TreeEntry::Regular, "/t1/d1");
+  Builder.push(make("blob3"), TreeEntry::Regular, "/t3/d3");
+  Builder.push(make("blob1"), TreeEntry::Regular, "/t3/t1nested/d1");
+  std::optional Root;
+  ASSERT_THAT_ERROR(Builder.create(*CAS).moveInto(Root), Succeeded());
+
+  // Expected visiting order; the flag says whether the entry is a tree.
+  std::pair ExpectedEntries[] = {
+      {"/", true},
+      {"/d2", false},
+      {"/t1", true},
+      {"/t1/d1", false},
+      {"/t3", true},
+      {"/t3/d3", false},
+      {"/t3/t1nested", true},
+      {"/t3/t1nested/d1", false},
+  };
+  auto RemainingEntries = ArrayRef(ExpectedEntries);
+
+  TreeSchema Schema(*CAS);
+  Error E = Schema.walkFileTreeRecursively(
+      *CAS, Root->getRef(),
+      [&](const NamedTreeEntry &Entry, std::optional Tree) -> Error {
+        // More callbacks than expectations: report it as an error.
+        if (RemainingEntries.empty())
+          return createStringError(inconvertibleErrorCode(),
+                                   "unexpected entry: '" + Entry.getName() +
+                                       "'");
+        auto ExpectedEntry = RemainingEntries.front();
+        RemainingEntries = RemainingEntries.drop_front();
+        EXPECT_EQ(ExpectedEntry.first, Entry.getName());
+        EXPECT_EQ(ExpectedEntry.second, Tree.has_value());
+        return Error::success();
+      });
+  EXPECT_THAT_ERROR(std::move(E), Succeeded());
+  // Fewer callbacks than expectations must fail as well: every expected entry
+  // has to have been consumed by the walk.
+  EXPECT_TRUE(RemainingEntries.empty());
+}
diff --git a/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp
new file mode 100644
index 00000000000000..eafa07ff0f4d27
--- /dev/null
+++ b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp
@@ -0,0 +1,182 @@
+//===- llvm/unittest/CAS/UnifiedOnDiskCacheTest.cpp -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "OnDiskCommonUtils.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+using namespace llvm::unittest::cas;
+
+/// Visits all the files of a directory recursively and returns the sum of their
+/// sizes.
+///
+/// Sub-directories are handled by recursing into them; every other entry
+/// contributes the size reported by its status. Any iteration or status
+/// failure is returned as a file error naming the offending path.
+static Expected countFileSizes(StringRef Path) {
+  size_t TotalSize = 0;
+  std::error_code EC;
+  // Stop iterating as soon as the iterator reports an error; EC is inspected
+  // again after the loop.
+  for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE;
+       DirI.increment(EC)) {
+    if (DirI->type() == sys::fs::file_type::directory_file) {
+      Expected Subsize = countFileSizes(DirI->path());
+      if (!Subsize)
+        return Subsize.takeError();
+      TotalSize += *Subsize;
+      continue;
+    }
+    ErrorOr Stat = DirI->status();
+    if (!Stat)
+      return createFileError(DirI->path(), Stat.getError());
+    TotalSize += Stat->getSize();
+  }
+  if (EC)
+    return createFileError(Path, EC);
+  return TotalSize;
+}
+
+// End-to-end scenario for UnifiedOnDiskCache:
+//  1. store a small graph ("root" -> "1", "2"), an unrelated object "other",
+//     and two key-value entries (key1 -> root, key2 -> "1");
+//  2. store large objects until the size limit is reported as exceeded, then
+//     reopen and check that the root tree and key1 are still readable;
+//  3. exceed the limit again, close, and check that garbage collection is
+//     requested both before and after reopening;
+//  4. run collectGarbage(), check that the directory shrank, and check that
+//     "other" and key2 are gone while the root tree and key1 remain.
+// The statements are order-dependent; each phase relies on the state left
+// behind by the previous one.
+TEST(UnifiedOnDiskCacheTest, Basic) {
+  unittest::TempDir Temp("ondisk-unified", /*Unique=*/true);
+  std::unique_ptr UniDB;
+
+  // Drops the current handle and opens the cache in Temp again, always with
+  // the same SizeLimit (1024 * 64).
+  auto reopenDB = [&]() {
+    UniDB.reset();
+    const uint64_t SizeLimit = 1024ull * 64;
+    ASSERT_THAT_ERROR(UnifiedOnDiskCache::open(Temp.path(), SizeLimit, "blake3",
+                                               sizeof(HashType))
+                          .moveInto(UniDB),
+                      Succeeded());
+  };
+
+  reopenDB();
+
+  // Digests/keys recorded in phase 1 and used by the checks further down.
+  HashType RootHash;
+  HashType OtherHash;
+  HashType Key1Hash;
+  HashType Key2Hash;
+  {
+    OnDiskGraphDB &DB = UniDB->getGraphDB();
+    std::optional ID1;
+    ASSERT_THAT_ERROR(store(DB, "1", {}).moveInto(ID1), Succeeded());
+    std::optional ID2;
+    ASSERT_THAT_ERROR(store(DB, "2", {}).moveInto(ID2), Succeeded());
+    // "root" references both objects stored above.
+    std::optional IDRoot;
+    ASSERT_THAT_ERROR(store(DB, "root", {*ID1, *ID2}).moveInto(IDRoot),
+                      Succeeded());
+    ArrayRef Digest = DB.getDigest(*IDRoot);
+    ASSERT_EQ(Digest.size(), RootHash.size());
+    llvm::copy(Digest, RootHash.data());
+
+    // "other" is stored without references and is not used as a value in
+    // either KVPut call below.
+    std::optional IDOther;
+    ASSERT_THAT_ERROR(store(DB, "other", {}).moveInto(IDOther), Succeeded());
+    Digest = DB.getDigest(*IDOther);
+    ASSERT_EQ(Digest.size(), OtherHash.size());
+    llvm::copy(Digest, OtherHash.data());
+
+    // key1 -> root, passing the key as a hash.
+    Key1Hash = digest("key1");
+    std::optional Val;
+    ASSERT_THAT_ERROR(UniDB->KVPut(Key1Hash, *IDRoot).moveInto(Val),
+                      Succeeded());
+    EXPECT_EQ(IDRoot, Val);
+
+    // key2 -> "1", passing the key as a reference obtained from the graph DB.
+    Key2Hash = digest("key2");
+    ASSERT_THAT_ERROR(
+        UniDB->KVPut(DB.getReference(Key2Hash), *ID1).moveInto(Val),
+        Succeeded());
+  }
+
+  // Prints the tree identified by Digest and compares it with ExpectedTree.
+  auto checkTree = [&](const HashType &Digest, StringRef ExpectedTree) {
+    OnDiskGraphDB &DB = UniDB->getGraphDB();
+    ObjectID ID = DB.getReference(Digest);
+    std::string PrintedTree;
+    raw_string_ostream OS(PrintedTree);
+    ASSERT_THAT_ERROR(printTree(DB, ID, OS), Succeeded());
+    EXPECT_EQ(PrintedTree, ExpectedTree);
+  };
+  auto checkRootTree = [&]() {
+    return checkTree(RootHash, "root\n 1\n 2\n");
+  };
+
+  // Looks Key up in the key-value store, loads the object it maps to, and
+  // compares that object's data with ExpectedData.
+  auto checkKey = [&](const HashType &Key, StringRef ExpectedData) {
+    OnDiskGraphDB &DB = UniDB->getGraphDB();
+    std::optional Val;
+    ASSERT_THAT_ERROR(UniDB->KVGet(Key).moveInto(Val), Succeeded());
+    ASSERT_TRUE(Val.has_value());
+    std::optional Obj;
+    ASSERT_THAT_ERROR(DB.load(*Val).moveInto(Obj), Succeeded());
+    EXPECT_EQ(toStringRef(DB.getObjectData(*Obj)), ExpectedData);
+  };
+
+  checkRootTree();
+  checkTree(OtherHash, "other\n");
+  checkKey(Key1Hash, "root");
+  checkKey(Key2Hash, "1");
+
+  // Stores an object made of 970 'a' characters followed by Index, so each
+  // call with a new index stores different data.
+  auto storeBigObject = [&](unsigned Index) {
+    SmallString<1000> Buf;
+    Buf.append(970, 'a');
+    raw_svector_ostream(Buf) << Index;
+    std::optional ID;
+    ASSERT_THAT_ERROR(store(UniDB->getGraphDB(), Buf, {}).moveInto(ID),
+                      Succeeded());
+  };
+
+  // Keep storing until the cache reports that the size limit was exceeded.
+  unsigned Index = 0;
+  while (!UniDB->hasExceededSizeLimit()) {
+    storeBigObject(Index++);
+  }
+
+  reopenDB();
+
+  // NOTE(review): "Garbase" in needsGarbaseCollection looks like a typo for
+  // "Garbage"; it has to match the declaration in UnifiedOnDiskCache.h, so it
+  // is left as is here - confirm against the header.
+  EXPECT_FALSE(UniDB->hasExceededSizeLimit());
+  EXPECT_FALSE(UniDB->needsGarbaseCollection());
+
+  checkRootTree();
+  checkKey(Key1Hash, "root");
+
+  // Exceed the limit a second time; Index continues from the first round so
+  // the stored data does not repeat.
+  while (!UniDB->hasExceededSizeLimit()) {
+    storeBigObject(Index++);
+  }
+  ASSERT_THAT_ERROR(UniDB->close(), Succeeded());
+  EXPECT_TRUE(UniDB->needsGarbaseCollection());
+
+  reopenDB();
+  EXPECT_TRUE(UniDB->needsGarbaseCollection());
+
+  // Measure the on-disk footprint around the collection; it must shrink.
+  std::optional DirSizeBefore;
+  ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeBefore),
+                    Succeeded());
+
+  ASSERT_THAT_ERROR(UnifiedOnDiskCache::collectGarbage(Temp.path()),
+                    Succeeded());
+
+  std::optional DirSizeAfter;
+  ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeAfter),
+                    Succeeded());
+  EXPECT_LT(*DirSizeAfter, *DirSizeBefore);
+
+  reopenDB();
+  EXPECT_FALSE(UniDB->needsGarbaseCollection());
+
+  checkRootTree();
+  checkKey(Key1Hash, "root");
+
+  // 'Other' tree and 'Key2' got garbage-collected.
+  {
+    OnDiskGraphDB &DB = UniDB->getGraphDB();
+    EXPECT_FALSE(DB.containsObject(DB.getReference(OtherHash)));
+    std::optional Val;
+    ASSERT_THAT_ERROR(UniDB->KVGet(Key2Hash).moveInto(Val), Succeeded());
+    EXPECT_FALSE(Val.has_value());
+  }
+}
+
+#endif // LLVM_ENABLE_ONDISK_CAS
diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt
index 911ede701982f6..f6a8acacfa4bce 100644
--- a/llvm/unittests/CMakeLists.txt
+++ b/llvm/unittests/CMakeLists.txt
@@ -20,6 +20,7 @@ add_subdirectory(AsmParser)
 add_subdirectory(BinaryFormat)
 add_subdirectory(Bitcode)
 add_subdirectory(Bitstream)
+add_subdirectory(CAS)
 add_subdirectory(CGData)
 add_subdirectory(CodeGen)
 add_subdirectory(DebugInfo)