139 changes: 66 additions & 73 deletions libc/benchmarks/JSONTest.cpp
@@ -25,22 +25,23 @@ namespace {

Study getStudy() {
return Study{
HostState{
"CpuName", 123, {CacheInfo{"A", 1, 2, 3}, CacheInfo{"B", 4, 5, 6}}},
BenchmarkOptions{std::chrono::seconds(1), std::chrono::seconds(2), 10,
100, 6, 100, 0.1, 2, BenchmarkLog::Full},
StudyConfiguration{2, 3, SizeRange{4, 5, 6}, Align(8), 9, 10},
{FunctionMeasurements{"A",
{Measurement{3, std::chrono::seconds(3)},
Measurement{3, std::chrono::seconds(4)}}},
FunctionMeasurements{"B", {}}}};
}

static std::string SerializeToString(const Study &S) {
"StudyName",
Runtime{HostState{"CpuName",
123,
{CacheInfo{"A", 1, 2, 3}, CacheInfo{"B", 4, 5, 6}}},
456, 789,
BenchmarkOptions{std::chrono::seconds(1), std::chrono::seconds(2),
10, 100, 6, 100, 0.1, 2, BenchmarkLog::Full}},
StudyConfiguration{std::string("Function"), 30U, false, 32U,
std::string("Distribution"), Align(16), 3U},
{std::chrono::seconds(3), std::chrono::seconds(4)}};
}

static std::string serializeToString(const Study &S) {
std::string Buffer;
raw_string_ostream RSO(Buffer);
json::OStream JOS(RSO);
SerializeToJson(S, JOS);
serializeToJson(S, JOS);
return Buffer;
}

@@ -54,14 +55,25 @@ MATCHER(EqualsCacheInfo, "") {
A, result_listener);
}

auto Equals(const HostState &H) -> auto {
auto equals(const HostState &H) -> auto {
return AllOf(
Field(&HostState::CpuName, H.CpuName),
Field(&HostState::CpuFrequency, H.CpuFrequency),
Field(&HostState::Caches, Pointwise(EqualsCacheInfo(), H.Caches)));
}

auto Equals(const BenchmarkOptions &BO) -> auto {
auto equals(const StudyConfiguration &SC) -> auto {
return AllOf(
Field(&StudyConfiguration::Function, SC.Function),
Field(&StudyConfiguration::NumTrials, SC.NumTrials),
Field(&StudyConfiguration::IsSweepMode, SC.IsSweepMode),
Field(&StudyConfiguration::SweepModeMaxSize, SC.SweepModeMaxSize),
Field(&StudyConfiguration::SizeDistributionName, SC.SizeDistributionName),
Field(&StudyConfiguration::AccessAlignment, SC.AccessAlignment),
Field(&StudyConfiguration::MemcmpMismatchAt, SC.MemcmpMismatchAt));
}

auto equals(const BenchmarkOptions &BO) -> auto {
return AllOf(
Field(&BenchmarkOptions::MinDuration, BO.MinDuration),
Field(&BenchmarkOptions::MaxDuration, BO.MaxDuration),
@@ -74,96 +86,73 @@ auto Equals(const BenchmarkOptions &BO) -> auto {
Field(&BenchmarkOptions::Log, BO.Log));
}

auto Equals(const SizeRange &SR) -> auto {
return AllOf(Field(&SizeRange::From, SR.From), Field(&SizeRange::To, SR.To),
Field(&SizeRange::Step, SR.Step));
}

auto Equals(const StudyConfiguration &SC) -> auto {
return AllOf(
Field(&StudyConfiguration::Runs, SC.Runs),
Field(&StudyConfiguration::BufferSize, SC.BufferSize),
Field(&StudyConfiguration::Size, Equals(SC.Size)),
Field(&StudyConfiguration::AddressAlignment, SC.AddressAlignment),
Field(&StudyConfiguration::MemsetValue, SC.MemsetValue),
Field(&StudyConfiguration::MemcmpMismatchAt, SC.MemcmpMismatchAt));
}

MATCHER(EqualsMeasurement, "") {
const Measurement &A = ::testing::get<0>(arg);
const Measurement &B = ::testing::get<1>(arg);
return ExplainMatchResult(AllOf(Field(&Measurement::Size, B.Size),
Field(&Measurement::Runtime, B.Runtime)),
A, result_listener);
auto equals(const Runtime &RI) -> auto {
return AllOf(Field(&Runtime::Host, equals(RI.Host)),
Field(&Runtime::BufferSize, RI.BufferSize),
Field(&Runtime::BatchParameterCount, RI.BatchParameterCount),
Field(&Runtime::BenchmarkOptions, equals(RI.BenchmarkOptions)));
}

MATCHER(EqualsFunctions, "") {
const FunctionMeasurements &A = ::testing::get<0>(arg);
const FunctionMeasurements &B = ::testing::get<1>(arg);
return ExplainMatchResult(
AllOf(Field(&FunctionMeasurements::Name, B.Name),
Field(&FunctionMeasurements::Measurements,
Pointwise(EqualsMeasurement(), B.Measurements))),
A, result_listener);
}

auto Equals(const Study &S) -> auto {
return AllOf(
Field(&Study::Host, Equals(S.Host)),
Field(&Study::Options, Equals(S.Options)),
Field(&Study::Configuration, Equals(S.Configuration)),
Field(&Study::Functions, Pointwise(EqualsFunctions(), S.Functions)));
auto equals(const Study &S) -> auto {
return AllOf(Field(&Study::StudyName, S.StudyName),
Field(&Study::Runtime, equals(S.Runtime)),
Field(&Study::Configuration, equals(S.Configuration)),
Field(&Study::Measurements, S.Measurements));
}

TEST(JsonTest, RoundTrip) {
const Study S = getStudy();
auto StudyOrError = ParseJsonStudy(SerializeToString(S));
const auto Serialized = serializeToString(S);
auto StudyOrError = parseJsonStudy(Serialized);
if (auto Err = StudyOrError.takeError())
EXPECT_FALSE(Err) << "Unexpected error";
EXPECT_FALSE(Err) << "Unexpected error : " << Err << "\n" << Serialized;
const Study &Parsed = *StudyOrError;
EXPECT_THAT(Parsed, Equals(S));
EXPECT_THAT(Parsed, equals(S)) << Serialized << "\n"
<< serializeToString(Parsed);
}

TEST(JsonTest, SupplementaryField) {
auto Failure = ParseJsonStudy(R"({
auto Failure = parseJsonStudy(R"({
"UnknownField": 10
}
)");
EXPECT_EQ(toString(Failure.takeError()), "Unknown field: UnknownField");
}

TEST(JsonTest, InvalidType) {
auto Failure = ParseJsonStudy(R"({
"Options": 1
auto Failure = parseJsonStudy(R"({
"Runtime": 1
}
)");
EXPECT_EQ(toString(Failure.takeError()), "Expected JSON Object");
}

TEST(JsonTest, InvalidDuration) {
auto Failure = ParseJsonStudy(R"({
"Options": {
"MinDuration": "Duration should be a Number"
auto Failure = parseJsonStudy(R"({
"Runtime": {
"BenchmarkOptions": {
"MinDuration": "Duration should be a Number"
}
}
}
)");
EXPECT_EQ(toString(Failure.takeError()), "Can't parse Duration");
}

TEST(JsonTest, InvalidAlignType) {
auto Failure = ParseJsonStudy(R"({
"Configuration":{
"AddressAlignment": "Align should be an Integer"
auto Failure = parseJsonStudy(R"({
"Configuration": {
"AccessAlignment": "Align should be an Integer"
}
}
)");
EXPECT_EQ(toString(Failure.takeError()), "Can't parse Align, not an Integer");
}

TEST(JsonTest, InvalidAlign) {
auto Failure = ParseJsonStudy(R"({
"Configuration":{
"AddressAlignment":3
auto Failure = parseJsonStudy(R"({
"Configuration": {
"AccessAlignment": 3
}
}
)");
@@ -172,9 +161,11 @@ TEST(JsonTest, InvalidAlign) {
}

TEST(JsonTest, InvalidBenchmarkLogType) {
auto Failure = ParseJsonStudy(R"({
"Options":{
"Log": 3
auto Failure = parseJsonStudy(R"({
"Runtime": {
"BenchmarkOptions":{
"Log": 3
}
}
}
)");
@@ -183,9 +174,11 @@ TEST(JsonTest, InvalidBenchmarkLogType) {
}

TEST(JsonTest, InvalidBenchmarkLog) {
auto Failure = ParseJsonStudy(R"({
"Options":{
"Log": "Unknown"
auto Failure = parseJsonStudy(R"({
"Runtime": {
"BenchmarkOptions":{
"Log": "Unknown"
}
}
}
)");
8 changes: 4 additions & 4 deletions libc/benchmarks/LibcBenchmark.h
@@ -41,10 +41,6 @@
namespace llvm {
namespace libc_benchmarks {

// Makes sure the binary was compiled in release mode and that frequency
// governor is set on performance.
void checkRequirements();

using Duration = std::chrono::duration<double>;

enum class BenchmarkLog {
@@ -318,6 +314,10 @@ CircularArrayRef<T> cycle(const std::array<T, N> &Container, size_t Size) {
return {llvm::ArrayRef<T>(Container.cbegin(), Container.cend()), Size};
}

// Makes sure the binary was compiled in release mode and that frequency
// governor is set on performance.
void checkRequirements();

} // namespace libc_benchmarks
} // namespace llvm

36 changes: 19 additions & 17 deletions libc/benchmarks/LibcMemoryBenchmark.cpp
@@ -20,40 +20,42 @@ namespace libc_benchmarks {
// When alignment is set, the distribution is scaled down by `Factor` and scaled
// up again by the same amount during sampling.
static std::uniform_int_distribution<uint32_t>
GetOffsetDistribution(const StudyConfiguration &Conf) {
if (Conf.AddressAlignment &&
*Conf.AddressAlignment > AlignedBuffer::Alignment)
getOffsetDistribution(size_t BufferSize, size_t MaxSizeValue,
MaybeAlign AccessAlignment) {
if (AccessAlignment && *AccessAlignment > AlignedBuffer::Alignment)
report_fatal_error(
"AddressAlignment must be less or equal to AlignedBuffer::Alignment");
if (!Conf.AddressAlignment)
"AccessAlignment must be less or equal to AlignedBuffer::Alignment");
if (!AccessAlignment)
return std::uniform_int_distribution<uint32_t>(0, 0); // Always 0.
// If we test up to Size bytes, the returned offset must stay under
// BufferSize - Size.
int64_t MaxOffset = Conf.BufferSize;
MaxOffset -= Conf.Size.To;
int64_t MaxOffset = BufferSize;
MaxOffset -= MaxSizeValue;
MaxOffset -= 1;
if (MaxOffset < 0)
report_fatal_error(
"BufferSize too small to exercise specified Size configuration");
MaxOffset /= Conf.AddressAlignment->value();
MaxOffset /= AccessAlignment->value();
return std::uniform_int_distribution<uint32_t>(0, MaxOffset);
}
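
For intuition, here is a standalone sketch of the sampling scheme implemented above; the 8 KiB buffer, 1 KiB maximum size, and 16-byte alignment are illustrative values, not taken from the patch.

```cpp
#include <cstdint>
#include <random>

// Mirrors getOffsetDistribution() plus the Factor multiplication done in
// OffsetDistribution::operator(): draw a scaled-down uniform value, then
// scale it back up by the alignment.
int main() {
  const int64_t BufferSize = 8192, MaxSizeValue = 1024, Alignment = 16;
  const int64_t MaxOffset = (BufferSize - MaxSizeValue - 1) / Alignment; // 447
  std::default_random_engine Gen;
  std::uniform_int_distribution<uint32_t> Distribution(0, MaxOffset);
  const uint32_t Offset = Distribution(Gen) * Alignment; // 16-byte aligned
  return (Offset + MaxSizeValue < BufferSize) ? 0 : 1;   // always 0 here
}
```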

OffsetDistribution::OffsetDistribution(const StudyConfiguration &Conf)
: Distribution(GetOffsetDistribution(Conf)),
Factor(Conf.AddressAlignment.valueOrOne().value()) {}
OffsetDistribution::OffsetDistribution(size_t BufferSize, size_t MaxSizeValue,
MaybeAlign AccessAlignment)
: Distribution(
getOffsetDistribution(BufferSize, MaxSizeValue, AccessAlignment)),
Factor(AccessAlignment.valueOrOne().value()) {}

// Precomputes the offsets at which to insert mismatches between the two buffers.
MismatchOffsetDistribution::MismatchOffsetDistribution(
const StudyConfiguration &Conf)
: MismatchAt(Conf.MemcmpMismatchAt) {
MismatchOffsetDistribution::MismatchOffsetDistribution(size_t BufferSize,
size_t MaxSizeValue,
size_t MismatchAt)
: MismatchAt(MismatchAt) {
if (MismatchAt <= 1)
return;
const auto ToSize = Conf.Size.To;
for (size_t I = ToSize + 1; I < Conf.BufferSize; I += ToSize)
for (size_t I = MaxSizeValue + 1; I < BufferSize; I += MaxSizeValue)
MismatchIndices.push_back(I);
if (MismatchIndices.empty())
llvm::report_fatal_error("Unable to generate mismatch");
report_fatal_error("Unable to generate mismatch");
MismatchIndexSelector =
std::uniform_int_distribution<size_t>(0, MismatchIndices.size() - 1);
}
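
As a concrete illustration of the precomputation above (the values mirror the expectations in LibcMemoryBenchmarkTest.cpp further down): with a 16-byte buffer and a maximum compare size of 4, a mismatch byte is planted after every 4-byte span of equal bytes.

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Standalone sketch of the MismatchIndices loop above.
int main() {
  const size_t BufferSize = 16, MaxSizeValue = 4;
  std::vector<size_t> MismatchIndices;
  for (size_t I = MaxSizeValue + 1; I < BufferSize; I += MaxSizeValue)
    MismatchIndices.push_back(I);
  for (size_t I : MismatchIndices)
    std::printf("%zu\n", I); // prints 5, 9, 13
  return 0;
}
```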
147 changes: 66 additions & 81 deletions libc/benchmarks/LibcMemoryBenchmark.h
@@ -13,7 +13,6 @@
#define LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_H

#include "LibcBenchmark.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Alignment.h"
#include <cstdint>
@@ -26,66 +25,79 @@ namespace libc_benchmarks {
// Configuration
//--------------

// Specifies a range of sizes to explore.
struct SizeRange {
uint32_t From = 0; // Inclusive
uint32_t To = 1024; // Inclusive
uint32_t Step = 1;
struct StudyConfiguration {
// One of 'memcpy', 'memset', 'memcmp'.
// The underlying implementation is always the llvm libc one.
// e.g. 'memcpy' will test '__llvm_libc::memcpy'
std::string Function;

// The number of trials to run for this benchmark.
// If in SweepMode, each individual size is measured 'NumTrials' times,
// i.e. 'NumTrials' measurements for 0, 'NumTrials' measurements for 1, ...
uint32_t NumTrials = 1;

// Toggles between Sweep Mode and Distribution Mode (default).
// See 'SweepModeMaxSize' and 'SizeDistributionName' below.
bool IsSweepMode = false;

// Maximum size to use when measuring a ramp of size values (SweepMode).
// The benchmark measures all sizes from 0 to SweepModeMaxSize.
// Note: in sweep mode the same size is sampled several times in a row; this
// allows the processor to learn it and optimize the branching pattern.
// The resulting measurement is likely to be idealized.
uint32_t SweepModeMaxSize = 0; // inclusive

// The name of the distribution to be used to randomize the size parameter.
// This is used when SweepMode is false (default).
std::string SizeDistributionName;

// This parameter controls how the buffers are accessed during the
// benchmark:
// None : Use a fixed address that is at least cache line aligned,
// 1 : Use random address,
// >1 : Use random address aligned to value.
MaybeAlign AccessAlignment = None;

// When Function == 'memcmp', this is the buffers mismatch position.
// 0 : Buffers always compare equal,
// >0 : Buffers compare different at byte N-1.
uint32_t MemcmpMismatchAt = 0;
};
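
As a rough sketch, here is how this configuration might be filled in for a distribution-mode `memcpy` study; the values echo the README example in this patch, and `makeExampleConfiguration` is a hypothetical helper, not part of the framework.

```cpp
#include "LibcMemoryBenchmark.h"

using namespace llvm;
using namespace llvm::libc_benchmarks;

// Illustrative values: measure __llvm_libc::memcpy with sizes drawn from a
// named distribution, 30 trials, and random 16-byte aligned accesses.
StudyConfiguration makeExampleConfiguration() {
  StudyConfiguration SC;
  SC.Function = "memcpy";
  SC.NumTrials = 30;
  SC.IsSweepMode = false; // Distribution Mode
  SC.SizeDistributionName = "memcpy Google A";
  SC.AccessAlignment = Align(16);
  return SC;
}
```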

// An object to define how to test a memory function.
struct StudyConfiguration {
// The number of run for the study.
uint32_t Runs = 1;

// The size of the buffers (1 buffer for memset but 2 for memcpy or memcmp).
// When testing small sizes, it's important to keep the total allocated
// size under the size of the L1 cache (usually 16 or 32KiB). The framework
// will also use 2KiB of additional L1 memory to store the function
// parameters.
uint32_t BufferSize = 8192;

// The range of sizes to exercise.
SizeRange Size;

MaybeAlign AddressAlignment; // Unset : Use start of buffer which is at
// least cache line aligned)
// 1 : Use random address,
// >1 : Use random address aligned to value.

// The value to use for memset.
uint8_t MemsetValue = 0;

// The mismatch position for memcmp.
uint32_t MemcmpMismatchAt = 0; // 0 : Buffer compare equal,
// >0 : Buffer compare different at byte N-1.
struct Runtime {
// Details about the Host (cpu name, cpu frequency, cache hierarchy).
HostState Host;

// The framework will populate this value so all data accessed during the
// benchmark will stay in L1 data cache. This includes bookkeeping data.
uint32_t BufferSize = 0;

// This is the number of distinct parameters used in a single batch.
// The framework always tests a batch of randomized parameters to prevent the
// cpu from learning branching patterns.
uint32_t BatchParameterCount = 0;

// The benchmark options that were used to perform the measurement.
// This is decided by the framework.
BenchmarkOptions BenchmarkOptions;
};

//--------
// Results
//--------

// The time to run one iteration of the function under test for the specified
// Size.
struct Measurement {
uint32_t Size = 0;
Duration Runtime = {};
};

// The measurements for a specific function.
struct FunctionMeasurements {
std::string Name;
std::vector<Measurement> Measurements;
};

// The root object containing all the data (configuration and measurements).
struct Study {
HostState Host;
BenchmarkOptions Options;
std::string StudyName;
Runtime Runtime;
StudyConfiguration Configuration;
SmallVector<FunctionMeasurements, 4> Functions;
std::vector<Duration> Measurements;
};
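
To connect this root object with the JSON layer exercised in JSONTest.cpp above, here is a small sketch; the `JSON.h` include and the `Expected<Study>` return type of `parseJsonStudy` are assumptions based on how the test uses it.

```cpp
#include "JSON.h"
#include "LibcMemoryBenchmark.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/raw_ostream.h"

namespace llvm {
namespace libc_benchmarks {

// Serialize a Study to a JSON string, as the round-trip test does.
static std::string writeStudy(const Study &S) {
  std::string Buffer;
  raw_string_ostream RSO(Buffer);
  json::OStream JOS(RSO);
  serializeToJson(S, JOS);
  return Buffer;
}

// Parse it back; unknown fields or wrong types surface as an llvm::Error.
static Expected<Study> readStudy(StringRef Json) { return parseJsonStudy(Json); }

} // namespace libc_benchmarks
} // namespace llvm
```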

//------
// Utils
//------

// Provides an aligned, dynamically allocated buffer.
class AlignedBuffer {
char *const Buffer = nullptr;
@@ -95,7 +107,8 @@ class AlignedBuffer {
static constexpr size_t Alignment = 1024;

explicit AlignedBuffer(size_t Size)
: Buffer(static_cast<char *>(aligned_alloc(1024, Size))), Size(Size) {}
: Buffer(static_cast<char *>(aligned_alloc(Alignment, Size))),
Size(Size) {}
~AlignedBuffer() { free(Buffer); }

inline char *operator+(size_t Index) { return Buffer + Index; }
@@ -106,44 +119,15 @@ class AlignedBuffer {
inline char *end() { return Buffer + Size; }
};
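
A tiny usage sketch (sizes are illustrative): the allocation is 1024-byte aligned, and `operator+` hands out raw pointers that the offset distributions below index into.

```cpp
#include "LibcMemoryBenchmark.h"
#include <cstring>

// Hypothetical example: clear a 64-byte region starting 32 bytes into the
// aligned buffer.
void exampleUse() {
  llvm::libc_benchmarks::AlignedBuffer Buffer(8192);
  char *Dst = Buffer + 32; // 32 bytes past the 1024-byte aligned start
  std::memset(Dst, 0, 64);
}
```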

// Implements the ParameterProvider abstraction needed by the `benchmark`
// function. This implementation makes sure that all parameters will fit into
// `StorageSize` bytes. The total memory accessed during benchmark should be
// less than the data L1 cache, that is the storage for the ParameterProvider
// and the memory buffers.
template <typename Context, size_t StorageSize = 8 * 1024>
class SmallParameterProvider {
using ParameterType = typename Context::ParameterType;
ByteConstrainedArray<ParameterType, StorageSize> Parameters;
size_t LastIterations;
Context &Ctx;

public:
explicit SmallParameterProvider(Context &C) : Ctx(C) {}
SmallParameterProvider(const SmallParameterProvider &) = delete;
SmallParameterProvider &operator=(const SmallParameterProvider &) = delete;

// Useful to compute the histogram of the size parameter.
CircularArrayRef<ParameterType> getLastBatch() const {
return cycle(Parameters, LastIterations);
}

// Implements the interface needed by the `benchmark` function.
CircularArrayRef<ParameterType> generateBatch(size_t Iterations) {
LastIterations = Iterations;
Ctx.Randomize(Parameters);
return getLastBatch();
}
};

// Helper to generate random buffer offsets that satisfy the configuration
// constraints.
class OffsetDistribution {
std::uniform_int_distribution<uint32_t> Distribution;
uint32_t Factor;

public:
explicit OffsetDistribution(const StudyConfiguration &Conf);
explicit OffsetDistribution(size_t BufferSize, size_t MaxSizeValue,
MaybeAlign AccessAlignment);

template <class Generator> uint32_t operator()(Generator &G) {
return Distribution(G) * Factor;
@@ -159,7 +143,8 @@ class MismatchOffsetDistribution {
const uint32_t MismatchAt;

public:
explicit MismatchOffsetDistribution(const StudyConfiguration &Conf);
explicit MismatchOffsetDistribution(size_t BufferSize, size_t MaxSizeValue,
size_t MismatchAt);

explicit operator bool() const { return !MismatchIndices.empty(); }

359 changes: 305 additions & 54 deletions libc/benchmarks/LibcMemoryBenchmarkMain.cpp

Large diffs are not rendered by default.

36 changes: 0 additions & 36 deletions libc/benchmarks/LibcMemoryBenchmarkMain.h

This file was deleted.

53 changes: 20 additions & 33 deletions libc/benchmarks/LibcMemoryBenchmarkTest.cpp
@@ -34,22 +34,16 @@ TEST(AlignedBuffer, Empty) {
}

TEST(OffsetDistribution, AlignToBegin) {
StudyConfiguration Conf;
Conf.BufferSize = 8192;
Conf.AddressAlignment = None;

OffsetDistribution OD(Conf);
const size_t BufferSize = 8192;
OffsetDistribution OD(BufferSize, 1024, None);
std::default_random_engine Gen;
for (size_t I = 0; I <= 10; ++I)
EXPECT_EQ(OD(Gen), 0U);
}

TEST(OffsetDistribution, NoAlignment) {
StudyConfiguration Conf;
Conf.BufferSize = 8192;
Conf.Size.To = 1;

OffsetDistribution OD(Conf);
const size_t BufferSize = 8192;
OffsetDistribution OD(BufferSize, 1, Align(1));
std::default_random_engine Gen;
for (size_t I = 0; I <= 10; ++I)
EXPECT_THAT(OD(Gen), AllOf(Ge(0U), Lt(8192U)));
@@ -61,49 +55,42 @@ MATCHER_P(IsDivisibleBy, n, "") {
}

TEST(OffsetDistribution, Aligned) {
StudyConfiguration Conf;
Conf.BufferSize = 8192;
Conf.AddressAlignment = Align(16);
Conf.Size.To = 1;

OffsetDistribution OD(Conf);
const size_t BufferSize = 8192;
OffsetDistribution OD(BufferSize, 1, Align(16));
std::default_random_engine Gen;
for (size_t I = 0; I <= 10; ++I)
EXPECT_THAT(OD(Gen), AllOf(Ge(0U), Lt(8192U), IsDivisibleBy(16U)));
}

TEST(MismatchOffsetDistribution, EqualBufferDisablesDistribution) {
StudyConfiguration Conf;
Conf.MemcmpMismatchAt = 0; // buffer are equal.
const size_t BufferSize = 8192;
const uint32_t MismatchAt = 0; // buffers are equal.

MismatchOffsetDistribution MOD(Conf);
MismatchOffsetDistribution MOD(BufferSize, 1024, MismatchAt);
EXPECT_FALSE(MOD);
}

TEST(MismatchOffsetDistribution, DifferentBufferDisablesDistribution) {
StudyConfiguration Conf;
Conf.MemcmpMismatchAt = 1; // buffer are different.
const size_t BufferSize = 8192;
const uint32_t MismatchAt = 1; // buffers are different.

MismatchOffsetDistribution MOD(Conf);
MismatchOffsetDistribution MOD(BufferSize, 1024, MismatchAt);
EXPECT_FALSE(MOD);
}

TEST(MismatchOffsetDistribution, MismatchAt2) {
const uint32_t MismatchAt = 2;
const uint32_t ToSize = 4;
StudyConfiguration Conf;
Conf.BufferSize = 16;
Conf.MemcmpMismatchAt = MismatchAt; // buffer are different at position 2.
Conf.Size.To = ToSize;

MismatchOffsetDistribution MOD(Conf);
const size_t BufferSize = 16;
const uint32_t MismatchAt = 2; // buffers are different at position 2.
const uint32_t MaxSize = 4;

MismatchOffsetDistribution MOD(BufferSize, MaxSize, MismatchAt);
EXPECT_TRUE(MOD);
// We test equality up to ToSize (=4) so we need spans of 4 equal bytes spaced
// by one mismatch.
// We test equality up to MaxSize (=4) so we need spans of 4 equal bytes
// spaced by one mismatch.
EXPECT_THAT(MOD.getMismatchIndices(), ElementsAre(5, 9, 13));
std::default_random_engine Gen;
for (size_t Iterations = 0; Iterations <= 10; ++Iterations) {
for (size_t Size = Conf.Size.From; Size <= ToSize; ++Size) {
for (size_t Size = 0; Size <= MaxSize; ++Size) {
if (Size >= MismatchAt)
EXPECT_THAT(MOD(Gen, Size),
AnyOf(5 - MismatchAt, 9 - MismatchAt, 13 - MismatchAt));
87 changes: 0 additions & 87 deletions libc/benchmarks/Memcmp.cpp

This file was deleted.

73 changes: 0 additions & 73 deletions libc/benchmarks/Memcpy.cpp

This file was deleted.

14 changes: 10 additions & 4 deletions libc/benchmarks/MemorySizeDistributions.cpp

Large diffs are not rendered by default.

70 changes: 0 additions & 70 deletions libc/benchmarks/Memset.cpp

This file was deleted.

4 changes: 2 additions & 2 deletions libc/benchmarks/RATIONALE.md
@@ -33,7 +33,7 @@ functions.

## Challenges

As seen in the [README.md](README.md#benchmarking-regimes) the microbenchmarking
As seen in the [README.md](README.md#stochastic-mode) the microbenchmarking
facility should focus on measuring **low latency code**. If copying a few bytes
takes in the order of a few cycles, the benchmark should be able to **measure
accurately down to the cycle**.
@@ -76,7 +76,7 @@ Each vendor decides which performance counters to implement and their exact
meaning. Although we want to benchmark `llvm-libc` memory functions for all
available [target
triples](https://clang.llvm.org/docs/CrossCompilation.html#target-triple), there
are **no guarantees that the counter we're interested in is available.**
are **no guarantees that the counter we're interested in is available.**

### Additional imprecisions

123 changes: 64 additions & 59 deletions libc/benchmarks/README.md
@@ -1,65 +1,59 @@
# Libc mem* benchmarks

This framework has been designed to evaluate and compare relative performance of
memory function implementations on a particular host.
This framework has been designed to evaluate and compare relative performance of memory function implementations on a particular machine.

It will also be used to track implementation performance over time.
It relies on two tools:
- `libc-benchmark-main`, a C++ benchmarking utility producing raw measurements,
- `libc-benchmark-analysis.py3`, a tool to process the measurements into reports.

## Quick start
## Benchmarking tool

### Setup

**Python 2** [being deprecated](https://www.python.org/doc/sunset-python-2/), it is
advised to use **Python 3**.

Then make sure to have `matplotlib`, `scipy` and `numpy` setup correctly:

```shell
apt-get install python3-pip
pip3 install matplotlib scipy numpy
cd llvm-project
cmake -B/tmp/build -Sllvm -DLLVM_ENABLE_PROJECTS='clang;clang-tools-extra;libc' -DCMAKE_BUILD_TYPE=Release -G Ninja
ninja -C /tmp/build libc-benchmark-main
```
You may need `python3-gtk` or similar package for displaying benchmark results.

To get good reproducibility it is important to make sure that the system runs in
`performance` mode. This is achieved by running:

> Note: The machine should run in `performance` mode. This is achieved by running:
```shell
cpupower frequency-set --governor performance
```

### Run and display `memcpy` benchmark
### Usage

The following commands will run the benchmark and display a 95 percentile
confidence interval curve of **time per copied bytes**. It also features **host
information** and **benchmarking configuration**.
`libc-benchmark-main` can run in two modes:
- **stochastic mode** returns the average time per call for a particular size distribution,
- **sweep mode** returns the average time per size over a range of sizes.

```shell
cd llvm-project
cmake -B/tmp/build -Sllvm -DLLVM_ENABLE_PROJECTS='clang;clang-tools-extra;libc' -DCMAKE_BUILD_TYPE=Release -G Ninja
ninja -C /tmp/build display-libc-memcpy-benchmark-small
```
The tool requires the following flags to be set:
- `--study-name`: a name to identify a run and provide a label during analysis,
- `--function`: the name of the function under test.

The display target will attempt to open a window on the machine where you're
running the benchmark. If this may not work for you then you may want `render`
or `run` instead as detailed below.
It also provides optional flags:
- `--num-trials`: repeats the benchmark several times; the analysis tool can take this into account and report confidence intervals.
- `--output`: specifies a file to write the report to - or standard output if not set.
- `--aligned-access`: the alignment to use when accessing the buffers; the default is unaligned, and 0 disables address randomization.

## Benchmarking targets
> Note: `--function` takes a generic function name like `memcpy` or `memset`, but the actual function being tested is the llvm-libc implementation (e.g. `__llvm_libc::memcpy`).
The benchmarking process occurs in two steps:
### Stochastic mode

1. Benchmark the functions and produce a `json` file
2. Display (or renders) the `json` file
This is the preferred mode to use. The function parameters are randomized and the branch predictor is less likely to kick in.

Targets are of the form `<action>-libc-<function>-benchmark-<configuration>`
```shell
/tmp/build/bin/libc-benchmark-main \
--study-name="new memcpy" \
--function=memcpy \
--size-distribution-name="memcpy Google A" \
--num-trials=30 \
--output=/tmp/benchmark_result.json
```

- `action` is one of :
- `run`, runs the benchmark and writes the `json` file
- `display`, displays the graph on screen
- `render`, renders the graph on disk as a `png` file
- `function` is one of : `memcpy`, `memcmp`, `memset`
- `configuration` is one of : `small`, `big`
The `--size-distribution-name` flag is mandatory and points to one of the [predefined distributions](libc/benchmarks/MemorySizeDistributions.h).

## Benchmarking regimes
> Note: These distributions are gathered from several important binaries at Google (servers, databases, realtime and batch jobs) and reflect the importance of focusing on small sizes.
Using a profiler to observe size distributions for calls into libc functions, it
was found that most operations act on a small number of bytes.
@@ -70,37 +64,48 @@ memcpy | 96% | 99%
memset | 91% | 99.9%
memcmp<sup>1</sup> | 99.5% | ~100%

Benchmarking configurations come in two flavors:

- [small](libc/utils/benchmarks/configuration_small.json)
- Exercises sizes up to `1KiB`, representative of normal usage
- The data is kept in the `L1` cache to prevent measuring the memory
subsystem
- [big](libc/utils/benchmarks/configuration_big.json)
- Exercises sizes up to `32MiB` to test large operations
- Caching effects can show up here which prevents comparing different hosts

_<sup>1</sup> - The size refers to the size of the buffers to compare and not
the number of bytes until the first difference._

## Superposing curves
### Sweep mode

This mode measures call latency per size over a range of sizes. Because it exercises the same size over and over again, the branch predictor can kick in. It can still be useful for comparing the strengths and weaknesses of particular implementations.

```shell
/tmp/build/bin/libc-benchmark-main \
--study-name="new memcpy" \
--function=memcpy \
--sweep-mode \
--sweep-max-size=128 \
--output=/tmp/benchmark_result.json
```

## Analysis tool

It is possible to **merge** several `json` files into a single graph. This is
useful to **compare** implementations.
### Setup

Make sure to have `matplotlib`, `pandas` and `seaborn` set up correctly:

```shell
apt-get install python3-pip
pip3 install matplotlib pandas seaborn
```
You may need `python3-gtk` or a similar package to display the graphs.

In the following example we superpose the curves for `memcpy`, `memset` and
`memcmp`:
### Usage

```shell
> make -C /tmp/build run-libc-memcpy-benchmark-small run-libc-memcmp-benchmark-small run-libc-memset-benchmark-small
> python libc/utils/benchmarks/render.py3 /tmp/last-libc-memcpy-benchmark-small.json /tmp/last-libc-memcmp-benchmark-small.json /tmp/last-libc-memset-benchmark-small.json
python3 libc/benchmarks/libc-benchmark-analysis.py3 /tmp/benchmark_result.json ...
```

## Useful `render.py3` flags
When used with __Sweep Mode data from multiple trials__, the tool displays the 95% confidence interval.

- To save the produced graph `--output=/tmp/benchmark_curve.png`.
- To prevent the graph from appearing on the screen `--headless`.
When provided with multiple reports at the same time, all the graphs from the same machine are displayed side by side to allow for comparison.

The Y-axis unit can be changed via the `--mode` flag (a conversion sketch follows the list):
- `time` displays the measured time (this is the default),
- `cycles` displays the number of cycles computed from the CPU frequency,
- `bytespercycle` displays the number of bytes per cycle (for `Sweep Mode` reports only).
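
Concretely, the conversions behind these modes are simple arithmetic; the sketch below uses made-up numbers, while the tool itself reads `Runtime.Host.CpuFrequency` and the raw `Measurements` from the report.

```cpp
#include <cstdio>

// Sketch of the --mode conversions (all values are assumed).
int main() {
  const double CpuFrequency = 3.0e9; // Hz, from the report
  const double TimePerCall = 2.5e-9; // seconds, one measurement
  const double CopySize = 16.0;      // bytes, the sweep-mode size index
  const double Cycles = TimePerCall * CpuFrequency; // --mode=cycles
  const double BytesPerCycle = CopySize / Cycles;   // --mode=bytespercycle
  std::printf("%.2f cycles, %.2f B/cycle\n", Cycles, BytesPerCycle);
  return 0;
}
```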

## Under the hood

24 changes: 0 additions & 24 deletions libc/benchmarks/configuration_big.json

This file was deleted.

24 changes: 0 additions & 24 deletions libc/benchmarks/configuration_small.json

This file was deleted.

128 changes: 128 additions & 0 deletions libc/benchmarks/libc-benchmark-analysis.py3
@@ -0,0 +1,128 @@
"""Reads JSON files produced by the benchmarking framework and renders them.
Installation:
> apt-get install python3-pip
> pip3 install matplotlib pandas seaborn
Run:
> python3 libc/benchmarks/libc-benchmark-analysis.py3 <files>
"""

import argparse
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import EngFormatter

def formatUnit(value, unit):
return EngFormatter(unit, sep="").format_data(value)

def formatCache(cache):
letter = cache["Type"][0].lower()
level = cache["Level"]
size = formatUnit(cache["Size"], "B")
ways = cache["NumSharing"]
return F'{letter}L{level}:{size}/{ways}'

def getCpuFrequency(study):
return study["Runtime"]["Host"]["CpuFrequency"]

def getId(study):
CpuName = study["Runtime"]["Host"]["CpuName"]
CpuFrequency = formatUnit(getCpuFrequency(study), "Hz")
Mode = " (Sweep)" if study["Configuration"]["IsSweepMode"] else ""
CpuCaches = ", ".join(formatCache(c) for c in study["Runtime"]["Host"]["Caches"])
return F'{CpuName} {CpuFrequency}{Mode}\n{CpuCaches}'

def getFunction(study):
return study["Configuration"]["Function"]

def getLabel(study):
return F'{getFunction(study)} {study["StudyName"]}'

def displaySweepData(id, studies, mode):
df = None
for study in studies:
Measurements = study["Measurements"]
SweepModeMaxSize = study["Configuration"]["SweepModeMaxSize"]
NumSizes = SweepModeMaxSize + 1
NumTrials = study["Configuration"]["NumTrials"]
assert NumTrials * NumSizes == len(Measurements), 'not a multiple of NumSizes'
Index = pd.MultiIndex.from_product([range(NumSizes), range(NumTrials)], names=['size', 'trial'])
if df is None:
df = pd.DataFrame(Measurements, index=Index, columns=[getLabel(study)])
else:
df[getLabel(study)] = pd.Series(Measurements, index=Index)
df = df.reset_index(level='trial', drop=True)
if mode == "cycles":
df *= getCpuFrequency(study)
if mode == "bytespercycle":
df *= getCpuFrequency(study)
for col in df.columns:
df[col] = pd.Series(data=df.index, index=df.index).divide(df[col])
FormatterUnit = {"time":"s","cycles":"","bytespercycle":"B/cycle"}[mode]
Label = {"time":"Time","cycles":"Cycles","bytespercycle":"Byte/cycle"}[mode]
graph = sns.lineplot(data=df, palette="muted", ci=95)
graph.set_title(id)
graph.yaxis.set_major_formatter(EngFormatter(unit=FormatterUnit))
graph.yaxis.set_label_text(Label)
graph.xaxis.set_major_formatter(EngFormatter(unit="B"))
graph.xaxis.set_label_text("Copy Size")
_ = plt.xticks(rotation=90)
plt.show()

def displayDistributionData(id, studies, mode):
distributions = set()
df = None
for study in studies:
distribution = study["Configuration"]["SizeDistributionName"]
distributions.add(distribution)
local = pd.DataFrame(study["Measurements"], columns=["time"])
local["distribution"] = distribution
local["label"] = getLabel(study)
local["cycles"] = local["time"] * getCpuFrequency(study)
if df is None:
df = local
else:
df = df.append(local)
if mode == "bytespercycle":
mode = "time"
print("`--mode=bytespercycle` is ignored for distribution mode reports")
FormatterUnit = {"time":"s","cycles":""}[mode]
Label = {"time":"Time","cycles":"Cycles"}[mode]
graph = sns.violinplot(data=df, x="distribution", y=mode, palette="muted", hue="label", order=sorted(distributions))
graph.set_title(id)
graph.yaxis.set_major_formatter(EngFormatter(unit=FormatterUnit))
graph.yaxis.set_label_text(Label)
_ = plt.xticks(rotation=90)
plt.show()


def main():
parser = argparse.ArgumentParser(description="Process benchmark json files.")
parser.add_argument("--mode", choices=["time", "cycles", "bytespercycle"], default="time", help="Use to display either 'time', 'cycles' or 'bytes/cycle'.")
parser.add_argument("files", nargs="+", help="The json files to read from.")

args = parser.parse_args()
study_groups = dict()
for file in args.files:
with open(file) as json_file:
json_obj = json.load(json_file)
Id = getId(json_obj)
if Id in study_groups:
study_groups[Id].append(json_obj)
else:
study_groups[Id] = [json_obj]

plt.tight_layout()
sns.set_theme(style="ticks")
for id, study_collection in study_groups.items():
if "(Sweep)" in id:
displaySweepData(id, study_collection, args.mode)
else:
displayDistributionData(id, study_collection, args.mode)


if __name__ == "__main__":
main()
194 changes: 0 additions & 194 deletions libc/benchmarks/render.py3

This file was deleted.