Skip to content

Commit

Permalink
[libc] NVPTX Profiling (#92009)
Browse files Browse the repository at this point in the history
PR for adding microbenchmarking infrastructure for NVPTX. `nvlink`
cannot perform LTO, so we cannot inline `libc` functions and this
function call overhead is not adjusted for during microbenchmarking.
  • Loading branch information
jameshu15869 committed Jun 26, 2024
1 parent 49e5cd2 commit 02b57de
Show file tree
Hide file tree
Showing 17 changed files with 644 additions and 11 deletions.
4 changes: 1 addition & 3 deletions libc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -401,9 +401,7 @@ if(LLVM_INCLUDE_TESTS)
add_subdirectory(fuzzing)
endif()

if(LIBC_INCLUDE_BENCHMARKS)
add_subdirectory(benchmarks)
endif()
add_subdirectory(benchmarks)

if (LIBC_INCLUDE_DOCS)
add_subdirectory(docs)
Expand Down
10 changes: 10 additions & 0 deletions libc/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# GPU targets only configure the gpu/ subdirectory; nothing below this block is
# processed for them.
if(LIBC_TARGET_OS_IS_GPU)
add_subdirectory(gpu)
return()
endif()

# The CPU build depends on Google benchmark.
# It is therefore opt-in: stop here unless LIBC_INCLUDE_BENCHMARKS is set.
if(NOT LIBC_INCLUDE_BENCHMARKS)
return()
endif()

find_package(Threads)

set(LLVM_LINK_COMPONENTS
Expand Down
97 changes: 97 additions & 0 deletions libc/benchmarks/gpu/BenchmarkLogger.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#include "benchmarks/gpu/BenchmarkLogger.h"
#include "src/__support/CPP/string.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/OSUtil/io.h" // write_to_stderr
#include "src/__support/big_int.h" // is_big_int
#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
#include "src/__support/uint128.h"

#include <stdint.h>

namespace LIBC_NAMESPACE {
namespace benchmarks {

// Specialization for cpp::string_view: hands the characters to
// write_to_stderr. The string-like specializations in this file forward here.
template <>
BenchmarkLogger &
BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
  BenchmarkLogger &self = *this;
  LIBC_NAMESPACE::write_to_stderr(str);
  return self;
}

// Specialization for cpp::string: logs the string's contents by viewing them
// as a cpp::string_view.
template <>
BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
  const auto contents = static_cast<cpp::string_view>(str);
  return *this << contents;
}

// Specialization for const char *: wraps the C string in a cpp::string_view
// and logs that.
template <>
BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
  const cpp::string_view text(str);
  return *this << text;
}

// Specialization for char *: same handling as the const char * case, for
// callers holding a mutable buffer.
template <> BenchmarkLogger &BenchmarkLogger::operator<< <char *>(char *str) {
  const cpp::string_view text(str);
  return *this << text;
}

// Specialization for char: logs the character itself (as a one-character
// view) rather than its numeric value.
template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) {
  const cpp::string_view single_char(&ch, 1);
  return *this << single_char;
}

// Specialization for bool: logs the word "true" or "false".
template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) {
  const char *text = cond ? "true" : "false";
  return *this << text;
}

// void * specialization: logs the address in hexadecimal.
// The hex formatter emits the "0x" prefix together with base-16 digits.
// Formatting the value with cpp::to_string instead would put decimal digits
// after a hand-written "0x" prefix, which misreports the address.
template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) {
  const IntegerToString<uintptr_t, radix::Hex::WithPrefix> buffer(
      reinterpret_cast<uintptr_t>(addr));
  return *this << buffer.view();
}

// Generic fallback used by the explicit instantiations in this file.
// Big-int types and unsigned integers wider than 64 bits are rendered in
// hexadecimal with a prefix; every other type goes through cpp::to_string.
template <typename T> BenchmarkLogger &BenchmarkLogger::operator<<(T t) {
  constexpr bool PRINT_AS_HEX =
      is_big_int_v<T> ||
      (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
       (sizeof(T) > sizeof(uint64_t)));
  if constexpr (!PRINT_AS_HEX) {
    return *this << cpp::to_string(t);
  } else {
    // Only sizes that are a whole number of 8-byte words are supported here.
    static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
    const IntegerToString<T, radix::Hex::WithPrefix> hex_digits(t);
    return *this << hex_digits.view();
  }
}

// Explicit instantiations of the generic operator<< for the built-in integer
// types, so that callers including only the header can link against them.
// Plain char is omitted because it has a dedicated specialization above that
// logs the character itself.
template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short);
template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int);
template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long);
template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long);
template BenchmarkLogger &
BenchmarkLogger::operator<< <unsigned char>(unsigned char);
template BenchmarkLogger &
BenchmarkLogger::operator<< <unsigned short>(unsigned short);
template BenchmarkLogger &
BenchmarkLogger::operator<< <unsigned int>(unsigned int);
template BenchmarkLogger &
BenchmarkLogger::operator<< <unsigned long>(unsigned long);
template BenchmarkLogger &
BenchmarkLogger::operator<< <unsigned long long>(unsigned long long);

// The builtin 128-bit unsigned type is instantiated only when the target
// reports support for it.
#ifdef LIBC_TYPES_HAS_INT128
template BenchmarkLogger &
BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
#endif // LIBC_TYPES_HAS_INT128
// Multi-word UInt widths. NOTE(review): these presumably satisfy
// is_big_int_v and therefore take the hexadecimal branch of the generic
// template -- confirm against big_int.h.
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>);
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>);
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<256>>(UInt<256>);
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);

// TODO: Add floating point formatting once it's supported by StringStream.

// Definition of the global logger that BenchmarkLogger.h declares as extern.
BenchmarkLogger log;

} // namespace benchmarks
} // namespace LIBC_NAMESPACE
27 changes: 27 additions & 0 deletions libc/benchmarks/gpu/BenchmarkLogger.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
//===-- Utilities to log to standard error during benchmarks ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H

namespace LIBC_NAMESPACE {
namespace benchmarks {

// A stream-style logger for hermetic GPU benchmarks. It holds no state; the
// implementation in BenchmarkLogger.cpp writes to standard error.
struct BenchmarkLogger {
constexpr BenchmarkLogger() = default;
// Logs one value and returns *this so calls can be chained. Only the types
// specialized or explicitly instantiated in BenchmarkLogger.cpp are
// available; no definition is visible from this header.
template <typename T> BenchmarkLogger &operator<<(T);
};

// A global BenchmarkLogger instance to be used in benchmarks.
extern BenchmarkLogger log;

} // namespace benchmarks
} // namespace LIBC_NAMESPACE

#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */
56 changes: 56 additions & 0 deletions libc/benchmarks/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Target-specific timing support used by the benchmark framework library.
add_subdirectory(timing)

# Umbrella target: every benchmark created with add_benchmark() is added to it
# as a dependency, so building gpu-benchmark builds them all.
add_custom_target(gpu-benchmark)

# add_benchmark(<benchmark_name> [LINK_LIBRARIES <lib>...] [<extra args>...])
#
# Creates a hermetic GPU benchmark target through add_libc_hermetic, links it
# against the LibcGpuBenchmark framework library, and registers it as a
# dependency of the gpu-benchmark umbrella target. Arguments that are not
# consumed as LINK_LIBRARIES are forwarded to add_libc_hermetic.
function(add_benchmark benchmark_name)
cmake_parse_arguments(
"BENCHMARK"
"" # Optional arguments
"" # Single value arguments
"LINK_LIBRARIES" # Multi-value arguments
${ARGN}
)
# The benchmark framework times runs with clock(); refuse to configure when
# the target does not provide that entrypoint.
if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
message(FATAL_ERROR "target does not support clock")
endif()
add_libc_hermetic(
${benchmark_name}
IS_BENCHMARK
LINK_LIBRARIES
LibcGpuBenchmark.hermetic
${BENCHMARK_LINK_LIBRARIES}
${BENCHMARK_UNPARSED_ARGUMENTS}
)
# Hook the fully qualified target into the umbrella target.
get_fq_target_name(${benchmark_name} fq_target_name)
add_dependencies(gpu-benchmark ${fq_target_name})
endfunction(add_benchmark)

# The benchmark framework library (runner, entry point and logger). Each
# benchmark created by add_benchmark() links against its hermetic variant,
# LibcGpuBenchmark.hermetic.
add_unittest_framework_library(
LibcGpuBenchmark
SRCS
LibcGpuBenchmark.cpp
LibcGpuBenchmarkMain.cpp
BenchmarkLogger.cpp
HDRS
LibcGpuBenchmark.h
BenchmarkLogger.h
DEPENDS
libc.src.__support.big_int
libc.src.__support.c_string
libc.src.__support.CPP.string
libc.src.__support.CPP.string_view
libc.src.__support.CPP.type_traits
libc.src.__support.CPP.functional
libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
libc.src.__support.fixed_point.fx_rep
libc.src.__support.macros.properties.types
libc.src.__support.OSUtil.osutil
libc.src.__support.uint128
libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
libc.src.time.clock
libc.benchmarks.gpu.timing.timing
)

add_subdirectory(src)
140 changes: 140 additions & 0 deletions libc/benchmarks/gpu/LibcGpuBenchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#include "LibcGpuBenchmark.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/string.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/fixedvector.h"
#include "src/time/gpu/time_utils.h"

namespace LIBC_NAMESPACE {
namespace benchmarks {

// Registry of benchmarks added through Benchmark::add_benchmark; it has room
// for 64 entries.
FixedVector<Benchmark *, 64> benchmarks;
// One result slot per thread, indexed by gpu::get_thread_id() in
// Benchmark::run_benchmarks; it has room for 1024 threads.
cpp::array<BenchmarkResult, 1024> results;

// Appends `benchmark` to the global registry so that run_benchmarks() will
// execute it.
// NOTE(review): the registry is a FixedVector with capacity 64 and the outcome
// of push_back is not checked here -- confirm what happens once it is full.
void Benchmark::add_benchmark(Benchmark *benchmark) {
benchmarks.push_back(benchmark);
}

// Collapses the per-thread results into a single summary.
//
// The first gpu::get_num_threads() entries of `results` are combined:
// cycles, standard deviation, samples, iterations and time are averaged over
// the threads, while min and max are the extremes seen across all threads.
// NOTE(review): assumes gpu::get_num_threads() never exceeds the 1024 slots
// of `results` -- confirm for the supported targets.
BenchmarkResult reduce_results(cpp::array<BenchmarkResult, 1024> &results) {
  BenchmarkResult result;
  uint64_t cycles_sum = 0;
  double standard_deviation_sum = 0;
  uint64_t min = UINT64_MAX;
  uint64_t max = 0;
  // These are sums over every thread, so they use 64-bit accumulators: a
  // 32-bit sum wraps once the combined count passes 2^32 - 1 (for example
  // 1024 threads at roughly 4.2 million iterations each), which would corrupt
  // the averages computed below.
  uint64_t samples_sum = 0;
  uint64_t iterations_sum = 0;
  clock_t time_sum = 0;
  const uint64_t num_threads = gpu::get_num_threads();
  for (uint64_t i = 0; i < num_threads; i++) {
    // Read in place; there is no need to copy the whole struct per thread.
    const BenchmarkResult &current_result = results[i];
    cycles_sum += current_result.cycles;
    standard_deviation_sum += current_result.standard_deviation;
    min = cpp::min(min, current_result.min);
    max = cpp::max(max, current_result.max);
    samples_sum += current_result.samples;
    iterations_sum += current_result.total_iterations;
    time_sum += current_result.total_time;
  }
  result.cycles = cycles_sum / num_threads;
  result.standard_deviation = standard_deviation_sum / num_threads;
  result.min = min;
  result.max = max;
  // The averages are no larger than the largest per-thread value, so they fit
  // back into the result fields.
  result.samples = samples_sum / num_threads;
  result.total_iterations = iterations_sum / num_threads;
  result.total_time = time_sum / num_threads;
  return result;
}

// Runs every registered benchmark on all threads and has thread 0 log one
// summary per benchmark.
//
// Each benchmark is run, reduced and reported before the next one starts.
// All threads store into the same `results` slots for every benchmark, so
// running them all first and reporting afterwards would leave only the last
// benchmark's data in `results` and log it under every benchmark's name.
void Benchmark::run_benchmarks() {
  const uint64_t id = gpu::get_thread_id();
  gpu::sync_threads();

  for (Benchmark *benchmark : benchmarks) {
    results[id] = benchmark->run();
    // Every thread must have stored its result before thread 0 reads them.
    gpu::sync_threads();
    if (id == 0) {
      const BenchmarkResult all_results = reduce_results(results);
      constexpr auto GREEN = "\033[32m";
      constexpr auto RESET = "\033[0m";
      log << GREEN << "[ RUN ] " << RESET << benchmark->get_name() << '\n';
      log << GREEN << "[ OK ] " << RESET << benchmark->get_name() << ": "
          << all_results.cycles << " cycles, " << all_results.min << " min, "
          << all_results.max << " max, " << all_results.total_iterations
          << " iterations, " << all_results.total_time << " ns, "
          << static_cast<long>(all_results.standard_deviation) << " stddev\n";
    }
    // Hold the other threads back until thread 0 has finished reading, so the
    // next benchmark cannot overwrite slots that are still being reduced.
    gpu::sync_threads();
  }
}

// Repeatedly invokes `wrapper_func` and summarises the cycle counts it
// returns.
//
// Measurements are taken in samples of `iterations` calls each. After every
// sample the iteration count is scaled by options.scaling_factor, until one
// of the stopping conditions is met:
//   * the accumulated wall time exceeds options.max_duration,
//   * options.max_samples or options.max_iterations is reached, or
//   * min_duration and min_samples are satisfied and the estimate changed by
//     less than options.epsilon.
//
// The smallest of ten LIBC_NAMESPACE::overhead() readings is subtracted from
// every measurement. The returned result carries the progressive estimate in
// `cycles`, the per-iteration extremes in `min`/`max`, and the standard
// deviation of the per-iteration measurements around their mean.
BenchmarkResult benchmark(const BenchmarkOptions &options,
                          cpp::function<uint64_t(void)> wrapper_func) {
  BenchmarkResult result;
  RuntimeEstimationProgression rep;
  uint32_t total_iterations = 0;
  uint32_t iterations = options.initial_iterations;
  if (iterations < 1u)
    iterations = 1;

  uint32_t samples = 0;
  uint64_t total_time = 0;
  uint64_t best_guess = 0;
  uint64_t total_cycles = 0;
  // Sum of squared per-iteration measurements. It is kept in floating point
  // because squaring a 64-bit cycle count can overflow an integer.
  double cycles_squared = 0.0;
  uint64_t min = UINT64_MAX;
  uint64_t max = 0;

  uint64_t overhead = UINT64_MAX;
  constexpr int OVERHEAD_ITERATIONS = 10;
  for (int i = 0; i < OVERHEAD_ITERATIONS; i++)
    overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());

  // The time limit is enforced by comparing the accumulated time against the
  // budget. Counting an unsigned budget down and testing it for ">= 0" can
  // never fail and wraps around once the budget is used up.
  const uint64_t max_duration = options.max_duration;
  while (total_time <= max_duration) {
    uint64_t sample_cycles = 0;
    const clock_t start = clock();
    for (uint32_t i = 0; i < iterations; i++) {
      const uint64_t raw_cycles = wrapper_func();
      // Clamp at zero: a measurement below the overhead would otherwise wrap
      // to a huge unsigned value and poison max and the sums.
      const uint64_t adjusted_cycles =
          raw_cycles > overhead ? raw_cycles - overhead : 0;
      max = cpp::max(max, adjusted_cycles);
      min = cpp::min(min, adjusted_cycles);
      sample_cycles += adjusted_cycles;
      const double adjusted_fp = static_cast<double>(adjusted_cycles);
      cycles_squared += adjusted_fp * adjusted_fp;
    }
    const clock_t end = clock();
    const clock_t duration_ns =
        ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
    total_time += duration_ns;
    samples++;
    total_cycles += sample_cycles;

    total_iterations += iterations;
    const double change_ratio =
        rep.compute_improvement({iterations, sample_cycles});
    best_guess = rep.current_estimation;

    if (samples >= options.max_samples || iterations >= options.max_iterations)
      break;
    if (total_time >= options.min_duration && samples >= options.min_samples &&
        change_ratio < options.epsilon)
      break;

    // NOTE(review): the product is truncated back to an integer, so with a
    // fractional factor small counts may not grow -- confirm that this is
    // acceptable for the configured scaling_factor.
    iterations *= options.scaling_factor;
  }
  result.cycles = best_guess;
  // Var[X] = E[X^2] - E[X]^2 over the individual iterations. The loop body
  // runs at least once, so total_iterations is non-zero here. Rounding can
  // push the difference slightly below zero; clamp it so sqrt never sees a
  // negative argument.
  const double mean_cycles =
      static_cast<double>(total_cycles) / total_iterations;
  const double variance =
      cycles_squared / total_iterations - mean_cycles * mean_cycles;
  result.standard_deviation =
      fputil::sqrt<double>(variance > 0.0 ? variance : 0.0);
  result.min = min;
  result.max = max;
  result.samples = samples;
  result.total_iterations = total_iterations;
  result.total_time = total_time;
  return result;
}

} // namespace benchmarks
} // namespace LIBC_NAMESPACE
Loading

0 comments on commit 02b57de

Please sign in to comment.