Skip to content

Commit

Permalink
Merge pull request #9877 from tvegas1/gtest_cache_bar1_free
Browse files Browse the repository at this point in the history
GTEST/COMMON: Cache CUDA device BAR1 available size
  • Loading branch information
yosefe committed May 19, 2024
2 parents d593201 + 6b84982 commit 407ab27
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 21 deletions.
1 change: 1 addition & 0 deletions contrib/lsan.supp
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
leak:libcuda
leak:nvmlInitWithFlags
1 change: 1 addition & 0 deletions test/gtest/common/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ int main(int argc, char **argv) {

/* set gpu context for tests that need it */
mem_buffer::set_device_context();
mem_buffer::get_bar1_free_size_nvml();

int ret;
ret = ucs::watchdog_start();
Expand Down
31 changes: 11 additions & 20 deletions test/gtest/common/mem_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -196,38 +196,29 @@ void mem_buffer::set_device_context()
device_set = true;
}

size_t mem_buffer::get_bar1_free_size()
{
/* All gtest CUDA tests explicitly assume that all memory allocations are
* done on device 0. The same assumption is followed here. */
size_t available_size = SIZE_MAX;
size_t mem_buffer::m_bar1_free_size = SIZE_MAX;

void mem_buffer::get_bar1_free_size_nvml()
{
#if HAVE_CUDA
nvmlDevice_t device;
nvmlBAR1Memory_t bar1mem;

if (NVML_CALL(nvmlInit_v2()) != UCS_OK) {
return available_size;
}

if (NVML_CALL(nvmlDeviceGetHandleByIndex(0, &device)) != UCS_OK) {
/* For whatever reason we cannot open the device handle.
* As a result, let's assume there is no limit on the size;
* in the worst case scenario gtest will fail at runtime */
return available_size;
return;
}

if (NVML_CALL(nvmlDeviceGetBAR1MemoryInfo(device, &bar1mem)) != UCS_OK) {
/* Similarly let's assume there is no limit on the size */
return available_size;
/* Assume no size limit in case of failure; in the worst case scenario
* gtest will fail at runtime */
if (NVML_CALL(nvmlDeviceGetHandleByIndex(0, &device)) == UCS_OK) {
if (NVML_CALL(nvmlDeviceGetBAR1MemoryInfo(device, &bar1mem)) ==
UCS_OK) {
mem_buffer::m_bar1_free_size = (size_t)bar1mem.bar1Free;
}
}

available_size = (size_t)bar1mem.bar1Free;

NVML_CALL(nvmlShutdown());
#endif

return available_size;
}

void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type)
Expand Down
10 changes: 9 additions & 1 deletion test/gtest/common/mem_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,15 @@ class mem_buffer {
/* returns whether ROCM device supports hipMallocPitch */
static bool is_rocm_malloc_pitch_supported();

/* Query the BAR1 free size from NVML and cache it */
static void get_bar1_free_size_nvml();

/* Return the free memory on the BAR1 / GPU. If a GPU is not used,
* SIZE_MAX is returned */
static size_t get_bar1_free_size();
static size_t get_bar1_free_size()
{
/* m_bar1_free_size is initialized to SIZE_MAX and is overwritten only when
 * get_bar1_free_size_nvml() successfully reads the BAR1 info from NVML */
return m_bar1_free_size;
}

mem_buffer(size_t size, ucs_memory_type_t mem_type);
mem_buffer(size_t size, ucs_memory_type_t mem_type, uint64_t seed);
Expand Down Expand Up @@ -144,6 +150,8 @@ class mem_buffer {
ucs_memory_type_t src_mem_type,
const uint64_t mem_types);

static size_t m_bar1_free_size;

const ucs_memory_type_t m_mem_type;
void * const m_ptr;
const size_t m_size;
Expand Down

0 comments on commit 407ab27

Please sign in to comment.