Skip to content

Commit

Permalink
Add a more comprehensive kokkos_{malloc, free} perf_test (kokkos#6377)
Browse files Browse the repository at this point in the history
* add kokkos_malloc / kokkos_free perf test

* Remove duplicate ViewAllocate_Raw (present in PerfTest_MallocFree)
  • Loading branch information
cwpearson committed Aug 29, 2023
1 parent d122c39 commit 43d3e53
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 34 deletions.
1 change: 1 addition & 0 deletions core/perf_test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ SET(
PerfTest_CustomReduction.cpp
PerfTest_ExecSpacePartitioning.cpp
PerfTestHexGrad.cpp
PerfTest_MallocFree.cpp
PerfTest_ViewAllocate.cpp
PerfTest_ViewCopy_a123.cpp
PerfTest_ViewCopy_b123.cpp
Expand Down
100 changes: 100 additions & 0 deletions core/perf_test/PerfTest_MallocFree.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#include <Kokkos_Core.hpp>
#include <benchmark/benchmark.h>
#include "Benchmark_Context.hpp"

namespace Benchmark {

// Selects the point within one benchmark iteration at which the elapsed
// time is reported.
enum class When {
  after_malloc,  // once the allocation call has returned
  after_touch,   // once the optional touch step has finished
  after_free     // once the allocation has been released
};

// Shared body of the kokkos_malloc / kokkos_free benchmarks.
//
// Every iteration starts a timer, allocates state.range(0) bytes with
// Kokkos::kokkos_malloc, optionally writes into the allocation, and frees it
// with Kokkos::kokkos_free. The elapsed time is passed to
// state.SetIterationTime() at the point selected by `when`.
//
//   state - benchmark state; state.range(0) is the allocation size N
//   touch - when true, write one byte every STRIDE bytes (then fence) before
//           the allocation is freed
//   when  - the point of the iteration at which the time is recorded
//
// After the loop a rate counter (iterations per second) named
// KokkosBenchmark::benchmark_fom("rate") is attached to the state.
static void Impl(benchmark::State& state, const bool touch, const When when) {
  const size_t N = state.range(0);
  for (auto _ : state) {
    Kokkos::Timer timer;
    char* a_ptr = static_cast<char*>(Kokkos::kokkos_malloc("A", N));
    if (When::after_malloc == when) {
      state.SetIterationTime(timer.seconds());
    }
    if (touch) {
      constexpr size_t STRIDE = 1024;  // stride for touching the allocation.
      // this is intended to be a safe value that would touch every "page", but
      // not saturate the memory bandwidth

      // Round the number of touches up: a plain N / STRIDE is zero for
      // N < STRIDE (so small allocations would never be touched at all) and
      // skips the trailing partial stride otherwise. The largest index
      // written, (num_touches - 1) * STRIDE, is still strictly less than N.
      const size_t num_touches = N / STRIDE + (N % STRIDE != 0 ? 1 : 0);
      Kokkos::parallel_for(
          num_touches, KOKKOS_LAMBDA(const size_t& i) {
            a_ptr[i * STRIDE] = static_cast<char>(i * STRIDE);
          });
      Kokkos::fence();
    }
    if (When::after_touch == when) {
      state.SetIterationTime(timer.seconds());
    }
    Kokkos::kokkos_free(a_ptr);
    if (When::after_free == when) {
      state.SetIterationTime(timer.seconds());
    }
  }

  state.counters[KokkosBenchmark::benchmark_fom("rate")] =
      benchmark::Counter(state.iterations(), benchmark::Counter::kIsRate);
}

// Benchmark of Kokkos::kokkos_malloc on its own: the allocation is not
// touched and the iteration time is recorded before it is freed.
static void Malloc(benchmark::State& state) {
  constexpr bool touch_allocation = false;
  constexpr When record_at        = When::after_malloc;
  Impl(state, touch_allocation, record_at);
}

// Benchmark of an allocate/free pair: the allocation is not touched and the
// iteration time is recorded after Kokkos::kokkos_free returns.
static void MallocFree(benchmark::State& state) {
  constexpr bool touch_allocation = false;
  constexpr When record_at        = When::after_free;
  Impl(state, touch_allocation, record_at);
}

// Benchmark of allocating and then writing into the allocation: the
// iteration time is recorded after the touch step, before the free.
static void MallocTouch(benchmark::State& state) {
  constexpr bool touch_allocation = true;
  constexpr When record_at        = When::after_touch;
  Impl(state, touch_allocation, record_at);
}

// Benchmark of the full allocate / touch / free sequence: the iteration
// time is recorded after Kokkos::kokkos_free returns.
static void MallocTouchFree(benchmark::State& state) {
  constexpr bool touch_allocation = true;
  constexpr When record_at        = When::after_free;
  Impl(state, touch_allocation, record_at);
}

// Register Malloc with a single argument named "N" (the allocation size
// passed to kokkos_malloc), swept from 1 to 2^32 with a range multiplier of
// 16. Timing is manual (taken from SetIterationTime) and reported in
// microseconds.
BENCHMARK(Malloc)
->ArgName("N")
->RangeMultiplier(16)
->Range(1, int64_t(1) << 32)
->UseManualTime()
->Unit(benchmark::kMicrosecond);

// Register MallocFree with a single argument named "N" (the allocation size
// passed to kokkos_malloc), swept from 1 to 2^32 with a range multiplier of
// 16. Timing is manual (taken from SetIterationTime) and reported in
// microseconds.
BENCHMARK(MallocFree)
->ArgName("N")
->RangeMultiplier(16)
->Range(1, int64_t(1) << 32)
->UseManualTime()
->Unit(benchmark::kMicrosecond);

// Register MallocTouch with a single argument named "N" (the allocation size
// passed to kokkos_malloc), swept from 1 to 2^32 with a range multiplier of
// 16. Timing is manual (taken from SetIterationTime) and reported in
// microseconds.
BENCHMARK(MallocTouch)
->ArgName("N")
->RangeMultiplier(16)
->Range(1, int64_t(1) << 32)
->UseManualTime()
->Unit(benchmark::kMicrosecond);

// Register MallocTouchFree with a single argument named "N" (the allocation
// size passed to kokkos_malloc), swept from 1 to 2^32 with a range
// multiplier of 16. Timing is manual (taken from SetIterationTime) and
// reported in microseconds.
BENCHMARK(MallocTouchFree)
->ArgName("N")
->RangeMultiplier(16)
->Range(1, int64_t(1) << 32)
->UseManualTime()
->Unit(benchmark::kMicrosecond);

} // namespace Benchmark
34 changes: 0 additions & 34 deletions core/perf_test/PerfTest_ViewAllocate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,28 +115,6 @@ static void ViewAllocate_Rank8(benchmark::State& state) {
}
}

// Benchmark of a raw allocation followed by a zero-fill.
//
// With N = state.range(0), every iteration allocates N^8 doubles through
// Kokkos::kokkos_malloc, sets each element to 0.0 in a parallel_for, fences,
// and records the elapsed time with SetIterationTime. The kokkos_free that
// follows is outside the timed region. The Layout template parameter is not
// used by the body.
//
// Counters: "MB" is the size of the allocation in megabytes, and the
// "GB/s" figure of merit is that size in gigabytes reported as an
// iteration-invariant rate.
template <class Layout>
static void ViewAllocate_Raw(benchmark::State& state) {
  const int N8 = static_cast<int>(std::pow(state.range(0), 8));
  // data processed in megabytes; computed in floating point because the
  // all-integer form truncates to whole megabytes before the conversion to
  // double
  const double data_processed =
      static_cast<double>(N8) * sizeof(double) / 1.0e6;
  for (auto _ : state) {
    Kokkos::Timer timer;
    double* a_ptr =
        static_cast<double*>(Kokkos::kokkos_malloc("A", sizeof(double) * N8));
    Kokkos::parallel_for(
        N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; });
    Kokkos::fence();
    const auto time = timer.seconds();
    Kokkos::kokkos_free(a_ptr);

    state.SetIterationTime(time);
    state.counters["MB"] = benchmark::Counter(data_processed);
    state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(
        data_processed / 1'000, benchmark::Counter::kIsIterationInvariantRate);
  }
}

BENCHMARK(ViewAllocate_Rank1<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
Expand Down Expand Up @@ -217,16 +195,4 @@ BENCHMARK(ViewAllocate_Rank8<Kokkos::LayoutRight>)
->Arg(N)
->UseManualTime();

// Register ViewAllocate_Raw for both layouts with manual timing. The
// registrations are compiled out for CUDA builds that do not define
// KOKKOS_ENABLE_CUDA_LAMBDA -- presumably because ViewAllocate_Raw uses
// KOKKOS_LAMBDA; confirm against the Kokkos configuration.
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
BENCHMARK(ViewAllocate_Raw<Kokkos::LayoutLeft>)
->ArgName("N")
->Arg(N)
->UseManualTime();

BENCHMARK(ViewAllocate_Raw<Kokkos::LayoutRight>)
->ArgName("N")
->Arg(N)
->UseManualTime();
#endif

} // namespace Test

0 comments on commit 43d3e53

Please sign in to comment.