Merge pull request #9877 from tvegas1/gtest_cache_bar1_free

GTEST/COMMON: Cache CUDA device BAR1 available size
openucx · May 19, 2024 · 407ab27 · 407ab27
2 parents d593201 + 6b84982
commit 407ab27
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 21 deletions.
diff --git a/contrib/lsan.supp b/contrib/lsan.supp
@@ -1 +1,2 @@
 leak:libcuda
+leak:nvmlInitWithFlags
diff --git a/test/gtest/common/main.cc b/test/gtest/common/main.cc
@@ -100,6 +100,7 @@ int main(int argc, char **argv) {
 
     /* set gpu context for tests that need it */
     mem_buffer::set_device_context();
+    mem_buffer::get_bar1_free_size_nvml();
 
     int ret;
     ret = ucs::watchdog_start();

diff --git a/test/gtest/common/mem_buffer.cc b/test/gtest/common/mem_buffer.cc
@@ -196,38 +196,29 @@ void mem_buffer::set_device_context()
     device_set = true;
 }
 
-size_t mem_buffer::get_bar1_free_size()
-{
-    /* All gtest CUDA tests explicitly assume that all memory allocations are
-     * done on the device 0. The same assumption is followed here. */
-    size_t available_size = SIZE_MAX;
+size_t mem_buffer::m_bar1_free_size = SIZE_MAX;
 
+void mem_buffer::get_bar1_free_size_nvml()
+{
 #if HAVE_CUDA
     nvmlDevice_t device;
     nvmlBAR1Memory_t bar1mem;
 
     if (NVML_CALL(nvmlInit_v2()) != UCS_OK) {
-        return available_size;
-    }
-
-    if (NVML_CALL(nvmlDeviceGetHandleByIndex(0, &device)) != UCS_OK) {
-        /* For whatever reason we cannot open device handle.
-         * As a result let's assume there is no limit on the size
-         * and in the worse case scenario gtest will fail in runtime */
-        return available_size;
+        return;
     }
 
-    if (NVML_CALL(nvmlDeviceGetBAR1MemoryInfo(device, &bar1mem)) != UCS_OK) {
-        /* Similarly let's assume there is no limit on the size */
-        return available_size;
+    /* Assume no size limit in case of failure; in the worst-case scenario
+     * gtest will fail at runtime */
+    if (NVML_CALL(nvmlDeviceGetHandleByIndex(0, &device)) == UCS_OK) {
+        if (NVML_CALL(nvmlDeviceGetBAR1MemoryInfo(device, &bar1mem)) ==
+            UCS_OK) {
+            mem_buffer::m_bar1_free_size = (size_t)bar1mem.bar1Free;
+        }
     }
 
-    available_size = (size_t)bar1mem.bar1Free;
-
     NVML_CALL(nvmlShutdown());
 #endif
-
-    return available_size;
 }
 
 void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type)

diff --git a/test/gtest/common/mem_buffer.h b/test/gtest/common/mem_buffer.h
@@ -93,9 +93,15 @@ class mem_buffer {
     /* returns whether ROCM device supports hipMallocPitch */
     static bool is_rocm_malloc_pitch_supported();
 
+    /* Query the BAR1 free size from NVML and cache it */
+    static void get_bar1_free_size_nvml();
+
     /* Return free memory on the BAR1 / GPU. If GPU is not used
      * SIZE_MAX is returned */
-    static size_t get_bar1_free_size();
+    static size_t get_bar1_free_size()
+    {
+        return m_bar1_free_size;
+    }
 
     mem_buffer(size_t size, ucs_memory_type_t mem_type);
     mem_buffer(size_t size, ucs_memory_type_t mem_type, uint64_t seed);
@@ -144,6 +150,8 @@ class mem_buffer {
                                 ucs_memory_type_t src_mem_type,
                                 const uint64_t mem_types);
 
+    static size_t           m_bar1_free_size;
+
     const ucs_memory_type_t m_mem_type;
     void * const            m_ptr;
     const size_t            m_size;