149 changes: 149 additions & 0 deletions compiler-rt/lib/xray/xray_allocator.h
@@ -0,0 +1,149 @@
//===-- xray_allocator.h ---------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// Defines the allocator interface for an arena allocator, used primarily for
// the profiling runtime.
//
//===----------------------------------------------------------------------===//
#ifndef XRAY_ALLOCATOR_H
#define XRAY_ALLOCATOR_H

#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include <cstddef>
#include <cstdint>

#include "sanitizer_common/sanitizer_internal_defs.h"

namespace __xray {

/// The Allocator type hands out fixed-sized chunks of memory that are
/// cache-line aligned and sized. This is useful for placement of
/// performance-sensitive data in memory that's frequently accessed. The
/// allocator also self-limits the peak memory usage to a dynamically defined
/// maximum.
///
/// N is the lower bound on the size of the blocks of memory returned by the
/// allocation function. The actual block size is N rounded up to the next
/// multiple of the cache-line size, i.e. however many whole cache lines are
/// required to subsume N bytes.
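///
/// Illustrative usage (a sketch; the sizes chosen here are arbitrary):
///
///   Allocator<256> A(1 << 20, 0);  // Blocks of at least 256 bytes, at most
///                                  // 1MB of memory handed out in total.
///   auto B = A.Allocate();
///   if (B.Data == nullptr) {
///     // Over the configured maximum; handle the allocation failure.
///   }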
template <size_t N> struct Allocator {
// The Allocator returns memory as Block instances.
struct Block {
/// Compute the minimum cache-line size multiple that is >= N.
static constexpr auto Size =
kCacheLineSize * ((N / kCacheLineSize) + (N % kCacheLineSize ? 1 : 0));
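    // For example, assuming kCacheLineSize == 64: N == 100 gives
    // Size == 64 * (1 + 1) == 128, while N == 64 gives Size == 64 exactly.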
void *Data = nullptr;
};

private:
  // A BlockLink contains a fixed number of blocks. We keep track of our
  // position in the chain with what is effectively an iterator: a pointer to
  // the current link plus an offset into the fixed set of blocks associated
  // with that link (see Tail and Counter below).
//
// We're calling it a "link" in the context of seeing these as a chain of
// block pointer containers (i.e. links in a chain).
struct BlockLink {
static_assert(kCacheLineSize % sizeof(void *) == 0,
"Cache line size is not divisible by size of void*; none of "
"the assumptions of the BlockLink will hold.");

    // We compute the number of pointers to areas of memory that we treat as
    // individual blocks. To ensure that instances of the BlockLink object are
    // exactly cache-line-sized, we deduct one pointer's worth of space, which
    // holds the pointer to the previous link.
    //
    // This structure corresponds to the following layout:
    //
    //   Blocks [ 0, 1, 2, .., BlockPtrCount - 1 ] | Prev
    //
static constexpr auto BlockPtrCount =
(kCacheLineSize / sizeof(Block *)) - 1;
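    // For example, assuming a 64-byte cache line and 8-byte pointers, this is
    // (64 / 8) - 1 == 7 blocks per link, with the remaining slot taken up by
    // the Prev pointer below.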

// FIXME: Align this to cache-line address boundaries?
Block Blocks[BlockPtrCount]{};
BlockLink *Prev = nullptr;
};

static_assert(sizeof(BlockLink) == kCacheLineSize,
"BlockLink instances must be cache-line-sized.");

static BlockLink NullLink;

// FIXME: Implement a freelist, in case we actually do intend to return memory
// to the allocator, as opposed to just de-allocating everything in one go?

size_t MaxMemory;
SpinMutex Mutex{};
BlockLink *Tail = &NullLink;
size_t Counter = 0;

BlockLink *NewChainLink() {
auto NewChain = reinterpret_cast<BlockLink *>(
InternalAlloc(sizeof(BlockLink), nullptr, kCacheLineSize));
auto BackingStore = reinterpret_cast<char *>(InternalAlloc(
BlockLink::BlockPtrCount * Block::Size, nullptr, kCacheLineSize));
size_t Offset = 0;
DCHECK_NE(NewChain, nullptr);
DCHECK_NE(BackingStore, nullptr);
for (auto &B : NewChain->Blocks) {
B.Data = BackingStore + Offset;
Offset += Block::Size;
}
NewChain->Prev = Tail;
return NewChain;
}

public:
Allocator(size_t M, size_t PreAllocate) : MaxMemory(M) {
// FIXME: Implement PreAllocate support!
}

Block Allocate() {
SpinMutexLock Lock(&Mutex);
// Check whether we're over quota.
if (Counter * Block::Size >= MaxMemory)
return {};

size_t ChainOffset = Counter % BlockLink::BlockPtrCount;

Block B{};
BlockLink *Link = Tail;
if (UNLIKELY(Counter == 0 || ChainOffset == 0))
Tail = Link = NewChainLink();

B = Link->Blocks[ChainOffset];
++Counter;
return B;
}

~Allocator() NOEXCEPT {
// We need to deallocate all the blocks, including the chain links.
for (auto *C = Tail; C != &NullLink;) {
      // The blocks in a link share one contiguous backing allocation (see
      // NewChainLink), so freeing the first block's data releases all of them
      // at once.
InternalFree(C->Blocks[0].Data);
auto Prev = C->Prev;
InternalFree(C);
C = Prev;
}
}
};

// Storage for the NullLink sentinel.
template <size_t N> typename Allocator<N>::BlockLink Allocator<N>::NullLink;

} // namespace __xray

#endif // XRAY_ALLOCATOR_H
337 changes: 337 additions & 0 deletions compiler-rt/lib/xray/xray_segmented_array.h
@@ -0,0 +1,337 @@
//===-- xray_segmented_array.h ---------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// Defines the implementation of a segmented array, with fixed-size chunks
// backing the segments.
//
//===----------------------------------------------------------------------===//
#ifndef XRAY_SEGMENTED_ARRAY_H
#define XRAY_SEGMENTED_ARRAY_H

#include "sanitizer_common/sanitizer_allocator.h"
#include "xray_allocator.h"
#include <type_traits>
#include <utility>

namespace __xray {

namespace {

constexpr size_t gcd(size_t a, size_t b) {
return (b == 0) ? a : gcd(b, a % b);
}

constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); }
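
// As a quick illustration of the helpers above: gcd(24, 64) == 8, so
// lcm(24, 64) == 24 * 64 / 8 == 192.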

} // namespace

/// The Array type provides an interface similar to std::vector<...> but does
/// not shrink in size. Once constructed, elements can be appended but cannot be
/// removed. The implementation is heavily dependent on the contract provided by
/// the Allocator type, in that all memory will be released when the Allocator
/// is destroyed. When an Array is destroyed, it will destroy elements in the
/// backing store but will not free the memory. The parameter N defines how many
/// elements of T there should be in a single block.
///
/// The default N is derived from the least common multiple of sizeof(T) and
/// the cache-line size: a block of that many bytes spans a whole number of
/// cache lines and holds a whole number of T objects with no wasted space.
/// Dividing the least common multiple by sizeof(T) gives the number of T's
/// per block.
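///
/// For example, assuming a 64-byte cache line and sizeof(T) == 24, the
/// default N is lcm(24, 64) / 24 == 192 / 24 == 8 elements per block, and
/// AllocatorChunkSize below is 24 * 8 == 192 bytes (exactly three cache
/// lines).
///
/// Illustrative usage (a sketch; Record is a hypothetical element type):
///
///   struct Record { uint64_t Id, Value, Extra; };  // sizeof == 24
///   Array<Record> Data;  // Backed by the lazily-created global allocator.
///   auto *R = Data.AppendEmplace(Record{1, 2, 3});
///   if (R == nullptr) {
///     // The backing allocator has run out of memory.
///   }
///   for (const auto &Rec : Data) {
///     // ...
///   }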
template <class T, size_t N = lcm(sizeof(T), kCacheLineSize) / sizeof(T)>
struct Array {
static constexpr size_t ChunkSize = N;
static constexpr size_t AllocatorChunkSize = sizeof(T) * ChunkSize;
using AllocatorType = Allocator<AllocatorChunkSize>;
static_assert(std::is_trivially_destructible<T>::value,
"T must be trivially destructible.");

private:
// TODO: Consider co-locating the chunk information with the data in the
// Block, as in an intrusive list -- i.e. putting the next and previous
// pointer values inside the Block storage.
struct Chunk {
typename AllocatorType::Block Block;
static constexpr size_t Size = N;
Chunk *Prev = nullptr;
Chunk *Next = nullptr;
};

static Chunk SentinelChunk;

AllocatorType *Allocator;
Chunk *Head = &SentinelChunk;
Chunk *Tail = &SentinelChunk;
size_t Size = 0;
size_t FreeElements = 0;

// Here we keep track of chunks in the freelist, to allow us to re-use chunks
// when elements are trimmed off the end.
Chunk *Freelist = &SentinelChunk;

Chunk *NewChunk() {
    // If enough elements have been trimmed, chunks we've allocated before may
    // be sitting on the freelist. Re-use one of those if possible, instead of
    // asking the allocator for a new block.
if (Freelist != &SentinelChunk) {
auto *FreeChunk = Freelist;
Freelist = FreeChunk->Next;
FreeChunk->Next = &SentinelChunk;
return FreeChunk;
}

auto Block = Allocator->Allocate();
if (Block.Data == nullptr)
return nullptr;
// TODO: Maybe use a separate managed allocator for Chunk instances?
auto C = reinterpret_cast<Chunk *>(InternalAlloc(sizeof(Chunk)));
if (C == nullptr)
return nullptr;
C->Block = Block;
return C;
}

static AllocatorType &GetGlobalAllocator() {
static AllocatorType *const GlobalAllocator = [] {
AllocatorType *A = reinterpret_cast<AllocatorType *>(
InternalAlloc(sizeof(AllocatorType)));
new (A) AllocatorType(2 << 10, 0);
return A;
}();

return *GlobalAllocator;
}

Chunk *InitHeadAndTail() {
DCHECK_EQ(Head, &SentinelChunk);
DCHECK_EQ(Tail, &SentinelChunk);
auto Chunk = NewChunk();
if (Chunk == nullptr)
return nullptr;
Chunk->Prev = &SentinelChunk;
Chunk->Next = &SentinelChunk;
Head = Chunk;
Tail = Chunk;
return Chunk;
}

Chunk *AppendNewChunk() {
auto Chunk = NewChunk();
if (Chunk == nullptr)
return nullptr;
Tail->Next = Chunk;
Chunk->Prev = Tail;
Chunk->Next = &SentinelChunk;
Tail = Chunk;
return Chunk;
}

// This Iterator models a BidirectionalIterator.
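  // An iterator is a (chunk pointer, global offset) pair: the element at
  // global index i lives in chunk number i / N, at slot i % N within that
  // chunk's block.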
template <class U> class Iterator {
Chunk *C = nullptr;
size_t Offset = 0;

public:
Iterator(Chunk *IC, size_t Off) : C(IC), Offset(Off) {}

Iterator &operator++() {
DCHECK_NE(C, &SentinelChunk);
if (++Offset % N)
return *this;

// At this point, we know that Offset % N == 0, so we must advance the
// chunk pointer.
DCHECK_EQ(Offset % N, 0);
C = C->Next;
return *this;
}

Iterator &operator--() {
DCHECK_NE(C, &SentinelChunk);
DCHECK_GT(Offset, 0);

// We check whether the offset was on a boundary before decrement, to see
// whether we need to retreat to the previous chunk.
if ((Offset-- % N) == 0)
C = C->Prev;
return *this;
}

Iterator operator++(int) {
Iterator Copy(*this);
++(*this);
return Copy;
}

Iterator operator--(int) {
Iterator Copy(*this);
--(*this);
return Copy;
}

template <class V, class W>
friend bool operator==(const Iterator<V> &L, const Iterator<W> &R) {
return L.C == R.C && L.Offset == R.Offset;
}

template <class V, class W>
friend bool operator!=(const Iterator<V> &L, const Iterator<W> &R) {
return !(L == R);
}

U &operator*() const {
DCHECK_NE(C, &SentinelChunk);
return reinterpret_cast<U *>(C->Block.Data)[Offset % N];
}

U *operator->() const {
DCHECK_NE(C, &SentinelChunk);
return reinterpret_cast<U *>(C->Block.Data) + (Offset % N);
}
};

public:
explicit Array(AllocatorType &A) : Allocator(&A) {}
Array() : Array(GetGlobalAllocator()) {}

Array(const Array &) = delete;
Array(Array &&O) NOEXCEPT : Allocator(O.Allocator),
Head(O.Head),
Tail(O.Tail),
Size(O.Size) {
O.Head = &SentinelChunk;
O.Tail = &SentinelChunk;
O.Size = 0;
}

bool empty() const { return Size == 0; }

AllocatorType &allocator() const {
DCHECK_NE(Allocator, nullptr);
return *Allocator;
}

size_t size() const { return Size; }

T *Append(const T &E) {
if (UNLIKELY(Head == &SentinelChunk))
if (InitHeadAndTail() == nullptr)
return nullptr;

auto Offset = Size % N;
if (UNLIKELY(Size != 0 && Offset == 0))
if (AppendNewChunk() == nullptr)
return nullptr;

auto Position = reinterpret_cast<T *>(Tail->Block.Data) + Offset;
*Position = E;
++Size;
FreeElements -= FreeElements ? 1 : 0;
return Position;
}

template <class... Args> T *AppendEmplace(Args &&... args) {
if (UNLIKELY(Head == &SentinelChunk))
if (InitHeadAndTail() == nullptr)
return nullptr;

auto Offset = Size % N;
if (UNLIKELY(Size != 0 && Offset == 0))
if (AppendNewChunk() == nullptr)
return nullptr;

auto Position = reinterpret_cast<T *>(Tail->Block.Data) + Offset;
// In-place construct at Position.
new (Position) T(std::forward<Args>(args)...);
++Size;
FreeElements -= FreeElements ? 1 : 0;
return Position;
}

T &operator[](size_t Offset) const {
    DCHECK_LT(Offset, Size);
    // Walk the chunk list to find the chunk containing the element at Offset.
auto C = Head;
while (Offset >= N) {
C = C->Next;
Offset -= N;
DCHECK_NE(C, &SentinelChunk);
}
auto Position = reinterpret_cast<T *>(C->Block.Data) + Offset;
return *Position;
}

T &front() const {
DCHECK_NE(Head, &SentinelChunk);
DCHECK_NE(Size, 0u);
return *reinterpret_cast<T *>(Head->Block.Data);
}

T &back() const {
DCHECK_NE(Tail, &SentinelChunk);
auto Offset = (Size - 1) % N;
return *(reinterpret_cast<T *>(Tail->Block.Data) + Offset);
}

template <class Predicate> T *find_element(Predicate P) const {
if (empty())
return nullptr;

auto E = end();
for (auto I = begin(); I != E; ++I)
if (P(*I))
return &(*I);

return nullptr;
}

  /// Remove `Elements` elements from the end. The backing blocks are kept
  /// (and recycled through the freelist), so elements appended after trimming
  /// do not require allocating new blocks.
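  /// A worked example, assuming N == 4: with Size == 10 (three chunks holding
  /// 4, 4, and 2 elements), trim(6) reduces Size to 4 and FreeElements to 6;
  /// since 6 / 4 == 1, one chunk moves onto the freelist and FreeElements
  /// drops to 2. A later append that needs a fresh chunk will pull it from
  /// the freelist instead of going back to the allocator.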
void trim(size_t Elements) {
DCHECK_LE(Elements, Size);
Size -= Elements;
FreeElements += Elements;

    // Check whether we've freed enough elements to warrant moving whole
    // chunks onto the freelist. We right-size the internal list by keeping
    // track of the number of "free" elements still held in the array.
auto ChunksToTrim = FreeElements / N;
for (size_t i = 0; i < ChunksToTrim; ++i, FreeElements -= N) {
// Put the tail into the Freelist.
auto *FreeChunk = Tail;
Tail = Tail->Prev;
if (Tail == &SentinelChunk)
Head = Tail;
else
Tail->Next = &SentinelChunk;
FreeChunk->Next = Freelist;
FreeChunk->Prev = Freelist->Prev;
Freelist = FreeChunk;
}
}

// Provide iterators.
Iterator<T> begin() const { return Iterator<T>(Head, 0); }
Iterator<T> end() const { return Iterator<T>(Tail, Size); }
Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0); }
Iterator<const T> cend() const { return Iterator<const T>(Tail, Size); }
};

// We need to have this storage definition out-of-line so that the compiler can
// ensure that storage for the SentinelChunk is defined and has a single
// address.
template <class T, size_t N>
typename Array<T, N>::Chunk Array<T, N>::SentinelChunk;

} // namespace __xray

#endif // XRAY_SEGMENTED_ARRAY_H