-
Notifications
You must be signed in to change notification settings - Fork 11.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PR for adding microbenchmarking infrastructure for NVPTX. `nvlink` cannot perform LTO, so we cannot inline `libc` functions and this function call overhead is not adjusted for during microbenchmarking.
- Loading branch information
1 parent
49e5cd2
commit 02b57de
Showing
17 changed files
with
644 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#include "benchmarks/gpu/BenchmarkLogger.h" | ||
#include "src/__support/CPP/string.h" | ||
#include "src/__support/CPP/string_view.h" | ||
#include "src/__support/OSUtil/io.h" // write_to_stderr | ||
#include "src/__support/big_int.h" // is_big_int | ||
#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128 | ||
#include "src/__support/uint128.h" | ||
|
||
#include <stdint.h> | ||
|
||
namespace LIBC_NAMESPACE { | ||
namespace benchmarks { | ||
|
||
// cpp::string_view specialization | ||
template <> | ||
BenchmarkLogger & | ||
BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) { | ||
LIBC_NAMESPACE::write_to_stderr(str); | ||
return *this; | ||
} | ||
|
||
// cpp::string specialization | ||
template <> | ||
BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) { | ||
return *this << static_cast<cpp::string_view>(str); | ||
} | ||
|
||
// const char* specialization | ||
template <> | ||
BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) { | ||
return *this << cpp::string_view(str); | ||
} | ||
|
||
// Specialization for (mutable) char *: handled exactly like const char *, by
// viewing the pointed-to characters through a cpp::string_view.
template <> BenchmarkLogger &BenchmarkLogger::operator<< <char *>(char *str) {
  const cpp::string_view text(str);
  return *this << text;
}
|
||
// Specialization for char: logs the character itself (as a one-character
// cpp::string_view) rather than its numeric value.
template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) {
  const cpp::string_view single_char(&ch, 1);
  return *this << single_char;
}
|
||
// Specialization for bool: logs the word "true" or "false".
template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) {
  if (cond)
    return *this << "true";
  return *this << "false";
}
|
||
// Specialization for void *: logs the address as a "0x"-prefixed hexadecimal
// number.
//
// The address is formatted with the same IntegerToString hex-with-prefix
// formatter that the generic operator<< uses for wide integers. Emitting a
// literal "0x" followed by cpp::to_string() would print *decimal* digits after
// a hexadecimal prefix, which is misleading.
template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) {
  const IntegerToString<uintptr_t, radix::Hex::WithPrefix> buffer(
      reinterpret_cast<uintptr_t>(addr));
  return *this << buffer.view();
}
|
||
// Generic operator<< used for the integer types explicitly instantiated in
// this file.
//
// Big-int types and unsigned integers wider than 64 bits are formatted in
// hexadecimal (with prefix) through IntegerToString; every other type is
// converted with cpp::to_string. In both cases the resulting text is forwarded
// to one of the string overloads.
template <typename T> BenchmarkLogger &BenchmarkLogger::operator<<(T t) {
  constexpr bool LOG_AS_HEX =
      is_big_int_v<T> || (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
                          (sizeof(T) > sizeof(uint64_t)));
  if constexpr (!LOG_AS_HEX) {
    return *this << cpp::to_string(t);
  } else {
    // Only checked for the wide types that actually take this branch.
    static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
    const IntegerToString<T, radix::Hex::WithPrefix> hex_digits(t);
    return *this << hex_digits.view();
  }
}
|
||
// is_integral specializations | ||
// char is already specialized to handle character | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short); | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int); | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long); | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long); | ||
template BenchmarkLogger & | ||
BenchmarkLogger::operator<< <unsigned char>(unsigned char); | ||
template BenchmarkLogger & | ||
BenchmarkLogger::operator<< <unsigned short>(unsigned short); | ||
template BenchmarkLogger & | ||
BenchmarkLogger::operator<< <unsigned int>(unsigned int); | ||
template BenchmarkLogger & | ||
BenchmarkLogger::operator<< <unsigned long>(unsigned long); | ||
template BenchmarkLogger & | ||
BenchmarkLogger::operator<< <unsigned long long>(unsigned long long); | ||
|
||
#ifdef LIBC_TYPES_HAS_INT128 | ||
template BenchmarkLogger & | ||
BenchmarkLogger::operator<< <__uint128_t>(__uint128_t); | ||
#endif // LIBC_TYPES_HAS_INT128 | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>); | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>); | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<256>>(UInt<256>); | ||
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>); | ||
|
||
// TODO: Add floating point formatting once it's supported by StringStream. | ||
|
||
BenchmarkLogger log; | ||
|
||
} // namespace benchmarks | ||
} // namespace LIBC_NAMESPACE |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
//===-- Utilities to log to standard output during tests --------*- C++ -*-===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H | ||
#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H | ||
|
||
namespace LIBC_NAMESPACE { | ||
namespace benchmarks { | ||
|
||
// A minimal stream-style logger for the GPU benchmark framework. The
// operator<< definitions live in the accompanying source file, where the
// cpp::string_view specialization forwards its argument to write_to_stderr
// (i.e. output goes to standard error, not standard output).
struct BenchmarkLogger { | ||
constexpr BenchmarkLogger() = default; | ||
// Logs a value of type T and returns *this so that calls can be chained.
// Only declared here; the supported types are the ones specialized or
// explicitly instantiated in the source file.
template <typename T> BenchmarkLogger &operator<<(T); | ||
}; | ||
|
||
// The global BenchmarkLogger instance to be used by benchmarks.
extern BenchmarkLogger log; | ||
|
||
} // namespace benchmarks | ||
} // namespace LIBC_NAMESPACE | ||
|
||
#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# Timing helpers; the framework library declared later in this file lists
# libc.benchmarks.gpu.timing.timing among its DEPENDS.
add_subdirectory(timing) | ||
|
||
# Aggregate target: add_benchmark() makes it depend on every registered
# benchmark.
add_custom_target(gpu-benchmark) | ||
|
||
# add_benchmark(<benchmark_name> [LINK_LIBRARIES <lib>...] [<other args>...])
#
# Declares a GPU benchmark through add_libc_hermetic() and attaches it to the
# `gpu-benchmark` aggregate target.
#   LINK_LIBRARIES  Extra libraries to link, in addition to
#                   LibcGpuBenchmark.hermetic.
# Arguments that are not recognised here are passed through unchanged to
# add_libc_hermetic(). Configuration fails if the target's entrypoint list does
# not contain libc.src.time.clock.
function(add_benchmark benchmark_name)
  cmake_parse_arguments("BENCHMARK" "" "" "LINK_LIBRARIES" ${ARGN})

  if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
    message(FATAL_ERROR "target does not support clock")
  endif()

  add_libc_hermetic(
    ${benchmark_name}
    IS_BENCHMARK
    LINK_LIBRARIES
      LibcGpuBenchmark.hermetic
      ${BENCHMARK_LINK_LIBRARIES}
    ${BENCHMARK_UNPARSED_ARGUMENTS}
  )

  # Hook the fully qualified target into the aggregate target.
  get_fq_target_name(${benchmark_name} fq_target_name)
  add_dependencies(gpu-benchmark ${fq_target_name})
endfunction()
|
||
# The benchmark framework library: benchmark registration/driver code, the
# entry point, and the logger.
# NOTE(review): add_benchmark() links against LibcGpuBenchmark.hermetic, which
# is presumably produced by this call -- confirm against
# add_unittest_framework_library().
add_unittest_framework_library( | ||
LibcGpuBenchmark | ||
SRCS | ||
LibcGpuBenchmark.cpp | ||
LibcGpuBenchmarkMain.cpp | ||
BenchmarkLogger.cpp | ||
HDRS | ||
LibcGpuBenchmark.h | ||
BenchmarkLogger.h | ||
DEPENDS | ||
libc.src.__support.big_int | ||
libc.src.__support.c_string | ||
libc.src.__support.CPP.string | ||
libc.src.__support.CPP.string_view | ||
libc.src.__support.CPP.type_traits | ||
libc.src.__support.CPP.functional | ||
libc.src.__support.CPP.limits | ||
libc.src.__support.CPP.algorithm | ||
libc.src.__support.fixed_point.fx_rep | ||
libc.src.__support.macros.properties.types | ||
libc.src.__support.OSUtil.osutil | ||
libc.src.__support.uint128 | ||
libc.src.__support.FPUtil.sqrt | ||
libc.src.__support.fixedvector | ||
libc.src.time.clock | ||
libc.benchmarks.gpu.timing.timing | ||
) | ||
|
||
# Processed after add_benchmark() and the framework library are declared.
add_subdirectory(src) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
#include "LibcGpuBenchmark.h" | ||
#include "src/__support/CPP/algorithm.h" | ||
#include "src/__support/CPP/array.h" | ||
#include "src/__support/CPP/string.h" | ||
#include "src/__support/FPUtil/sqrt.h" | ||
#include "src/__support/GPU/utils.h" | ||
#include "src/__support/fixedvector.h" | ||
#include "src/time/gpu/time_utils.h" | ||
|
||
namespace LIBC_NAMESPACE { | ||
namespace benchmarks { | ||
|
||
// Benchmarks registered through Benchmark::add_benchmark();
// Benchmark::run_benchmarks() iterates over this container.
FixedVector<Benchmark *, 64> benchmarks; | ||
// Result slots indexed by GPU thread id in Benchmark::run_benchmarks() and
// combined into a single summary by reduce_results().
cpp::array<BenchmarkResult, 1024> results; | ||
|
||
// Appends `benchmark` to the global `benchmarks` list that run_benchmarks()
// walks. Whatever push_back returns is not inspected.
void Benchmark::add_benchmark(Benchmark *benchmark) {
  static_cast<void>(benchmarks.push_back(benchmark));
}
|
||
// Combines the per-thread results in `results` into one summary.
//
// The first gpu::get_num_threads() entries are read (never more than the
// array holds). `min` and `max` are the extremes over all threads; every other
// field is the arithmetic mean of the per-thread values.
BenchmarkResult reduce_results(cpp::array<BenchmarkResult, 1024> &results) {
  // Clamp to the array size so a launch with more threads than result slots
  // cannot read out of bounds.
  const uint64_t num_threads =
      cpp::min(static_cast<uint64_t>(gpu::get_num_threads()),
               static_cast<uint64_t>(results.size()));

  uint64_t cycles_sum = 0;
  double standard_deviation_sum = 0;
  uint64_t min = UINT64_MAX;
  uint64_t max = 0;
  // 64-bit accumulators: summing up to 1024 per-thread 32-bit counters can
  // exceed the range of uint32_t.
  uint64_t samples_sum = 0;
  uint64_t iterations_sum = 0;
  clock_t time_sum = 0;

  for (uint64_t i = 0; i < num_threads; i++) {
    const BenchmarkResult &current_result = results[i];
    cycles_sum += current_result.cycles;
    standard_deviation_sum += current_result.standard_deviation;
    min = cpp::min(min, current_result.min);
    max = cpp::max(max, current_result.max);
    samples_sum += current_result.samples;
    iterations_sum += current_result.total_iterations;
    time_sum += current_result.total_time;
  }

  BenchmarkResult result;
  result.cycles = cycles_sum / num_threads;
  result.standard_deviation = standard_deviation_sum / num_threads;
  result.min = min;
  result.max = max;
  result.samples =
      static_cast<decltype(result.samples)>(samples_sum / num_threads);
  result.total_iterations = static_cast<decltype(result.total_iterations)>(
      iterations_sum / num_threads);
  result.total_time = time_sum / num_threads;
  return result;
}
|
||
void Benchmark::run_benchmarks() { | ||
uint64_t id = gpu::get_thread_id(); | ||
gpu::sync_threads(); | ||
|
||
for (Benchmark *benchmark : benchmarks) | ||
results[id] = benchmark->run(); | ||
gpu::sync_threads(); | ||
if (id == 0) { | ||
for (Benchmark *benchmark : benchmarks) { | ||
BenchmarkResult all_results = reduce_results(results); | ||
constexpr auto GREEN = "\033[32m"; | ||
constexpr auto RESET = "\033[0m"; | ||
log << GREEN << "[ RUN ] " << RESET << benchmark->get_name() << '\n'; | ||
log << GREEN << "[ OK ] " << RESET << benchmark->get_name() << ": " | ||
<< all_results.cycles << " cycles, " << all_results.min << " min, " | ||
<< all_results.max << " max, " << all_results.total_iterations | ||
<< " iterations, " << all_results.total_time << " ns, " | ||
<< static_cast<long>(all_results.standard_deviation) << " stddev\n"; | ||
} | ||
} | ||
gpu::sync_threads(); | ||
} | ||
|
||
// Repeatedly invokes `wrapper_func` and summarises the cycle counts it
// returns.
//
// Parameters:
//   options      - iteration/sample/duration limits, convergence epsilon and
//                  the growth factor applied to the iteration count.
//   wrapper_func - callable returning the cycle count of one measured
//                  invocation; the measured timing overhead is subtracted from
//                  each value.
//
// Returns a BenchmarkResult holding the estimator's current per-iteration
// estimate (`cycles`), the standard deviation of the per-iteration cycle
// counts, the smallest and largest observation, the number of samples, the
// total number of iterations and the total elapsed time in nanoseconds.
//
// At least one sample is always taken. Sampling stops when the time budget
// (options.max_duration) is used up, when the sample or iteration limits are
// reached, or when the minimum duration and sample count are satisfied and
// the estimate changed by less than options.epsilon.
BenchmarkResult benchmark(const BenchmarkOptions &options,
                          cpp::function<uint64_t(void)> wrapper_func) {
  BenchmarkResult result;
  RuntimeEstimationProgression rep;
  uint32_t total_iterations = 0;
  uint32_t iterations = options.initial_iterations;
  if (iterations < 1u)
    iterations = 1;

  uint32_t samples = 0;
  uint64_t total_time = 0;
  uint64_t best_guess = 0;
  uint64_t total_cycles = 0;
  // Sum of squared per-iteration cycle counts. Kept as a double so that it
  // cannot wrap around for long-running benchmarks.
  double cycles_squared = 0.0;
  uint64_t min = UINT64_MAX;
  uint64_t max = 0;

  // Use the smallest observed timing overhead as the correction term.
  constexpr int OVERHEAD_ITERATIONS = 10;
  uint64_t overhead = UINT64_MAX;
  for (int i = 0; i < OVERHEAD_ITERATIONS; i++)
    overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());

  // The budget is signed: with an unsigned counter the subtraction below
  // wraps around and a `>= 0` test can never end the loop.
  int64_t time_budget = static_cast<int64_t>(options.max_duration);
  do {
    uint64_t sample_cycles = 0;
    const clock_t start = clock();
    for (uint32_t i = 0; i < iterations; i++) {
      const uint64_t measured = wrapper_func();
      // Saturate at zero: a measurement below the overhead estimate would
      // otherwise wrap to a huge value and poison max and the sums.
      const uint64_t cycles = measured > overhead ? measured - overhead : 0;
      max = cpp::max(max, cycles);
      min = cpp::min(min, cycles);
      sample_cycles += cycles;
      cycles_squared +=
          static_cast<double>(cycles) * static_cast<double>(cycles);
    }
    const clock_t end = clock();
    const clock_t duration_ns =
        ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
    total_time += duration_ns;
    time_budget -= static_cast<int64_t>(duration_ns);
    samples++;
    total_cycles += sample_cycles;

    total_iterations += iterations;
    const double change_ratio =
        rep.compute_improvement({iterations, sample_cycles});
    best_guess = rep.current_estimation;

    if (samples >= options.max_samples || iterations >= options.max_iterations)
      break;
    if (total_time >= options.min_duration && samples >= options.min_samples &&
        change_ratio < options.epsilon)
      break;

    iterations *= options.scaling_factor;
  } while (time_budget >= 0);

  // Var[x] = E[x^2] - E[x]^2 over all measured iterations. Rounding can push
  // the difference slightly below zero, so clamp before taking the root.
  const double mean = static_cast<double>(total_cycles) / total_iterations;
  const double variance = cycles_squared / total_iterations - mean * mean;

  result.cycles = best_guess;
  result.standard_deviation =
      fputil::sqrt<double>(variance > 0.0 ? variance : 0.0);
  result.min = min;
  result.max = max;
  result.samples = samples;
  result.total_iterations = total_iterations;
  result.total_time = total_time;
  return result;
}
|
||
} // namespace benchmarks | ||
} // namespace LIBC_NAMESPACE |
Oops, something went wrong.