From 6d1221e0f0bd2e8054efa23c4cd8bc5e19ab2894 Mon Sep 17 00:00:00 2001 From: zdevito Date: Tue, 18 Nov 2025 14:55:16 -0800 Subject: [PATCH 1/5] Remove direct dependency on libcuda Dynamically load any cuda driver functions so that monarch library can be loaded a machine that doesn't have a gpu even if the library is built with one. Differential Revision: [D87380631](https://our.internmc.facebook.com/intern/diff/D87380631/) [ghstack-poisoned] --- cuda-sys/build.rs | 11 +- cuda-sys/src/lib.rs | 16 -- cuda-sys/src/wrapper.h | 1 - monarch_rdma/src/macros.rs | 4 +- monarch_rdma/src/rdma_manager_actor.rs | 16 +- monarch_rdma/src/test_utils.rs | 100 ++++++------ rdmaxcel-sys/build.rs | 23 ++- rdmaxcel-sys/src/driver_api.cpp | 204 +++++++++++++++++++++++++ rdmaxcel-sys/src/driver_api.h | 95 ++++++++++++ rdmaxcel-sys/src/rdmaxcel.cpp | 15 +- rdmaxcel-sys/src/rdmaxcel.h | 1 + 11 files changed, 393 insertions(+), 93 deletions(-) create mode 100644 rdmaxcel-sys/src/driver_api.cpp create mode 100644 rdmaxcel-sys/src/driver_api.h diff --git a/cuda-sys/build.rs b/cuda-sys/build.rs index 663a4197b..22b8b4675 100644 --- a/cuda-sys/build.rs +++ b/cuda-sys/build.rs @@ -31,11 +31,11 @@ fn main() { .clang_arg("c++") .clang_arg("-std=gnu++20") .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) - // Allow the specified functions and types - .allowlist_function("cu.*") - .allowlist_function("CU.*") - .allowlist_type("cu.*") - .allowlist_type("CU.*") + // Allow the specified functions and types (CUDA Runtime API only) + .allowlist_function("cuda.*") + .allowlist_function("CUDA.*") + .allowlist_type("cuda.*") + .allowlist_type("CUDA.*") // Use newtype enum style .default_enum_style(bindgen::EnumVariation::NewType { is_bitfield: false, @@ -78,7 +78,6 @@ fn main() { } }; println!("cargo:rustc-link-search=native={}", cuda_lib_dir); - println!("cargo:rustc-link-lib=cuda"); println!("cargo:rustc-link-lib=cudart"); // Generate bindings - fail fast if this doesn't work diff --git 
a/cuda-sys/src/lib.rs b/cuda-sys/src/lib.rs index 2fc1d05ba..64ed70c64 100644 --- a/cuda-sys/src/lib.rs +++ b/cuda-sys/src/lib.rs @@ -35,19 +35,3 @@ mod inner { pub use inner::*; -#[cfg(test)] -mod tests { - use std::mem::MaybeUninit; - - use super::*; - - #[test] - fn sanity() { - // SAFETY: testing bindings - unsafe { - let mut version = MaybeUninit::::uninit(); - let result = cuDriverGetVersion(version.as_mut_ptr()); - assert_eq!(result, cudaError_enum(0)); - } - } -} diff --git a/cuda-sys/src/wrapper.h b/cuda-sys/src/wrapper.h index 04b61ecf7..02b4028d6 100644 --- a/cuda-sys/src/wrapper.h +++ b/cuda-sys/src/wrapper.h @@ -8,5 +8,4 @@ #pragma once -#include #include diff --git a/monarch_rdma/src/macros.rs b/monarch_rdma/src/macros.rs index 785d13924..097ac9b60 100644 --- a/monarch_rdma/src/macros.rs +++ b/monarch_rdma/src/macros.rs @@ -9,9 +9,9 @@ #[macro_export] macro_rules! cu_check { ($result:expr) => { - if $result != cuda_sys::CUresult::CUDA_SUCCESS { + if $result != rdmaxcel_sys::CUDA_SUCCESS { let mut error_string: *const std::os::raw::c_char = std::ptr::null(); - cuda_sys::cuGetErrorString($result, &mut error_string); + rdmaxcel_sys::rdmaxcel_cuGetErrorString($result, &mut error_string); panic!( "cuda failure {}:{} {:?} '{}'", file!(), diff --git a/monarch_rdma/src/rdma_manager_actor.rs b/monarch_rdma/src/rdma_manager_actor.rs index 2af05fa79..2083d3fa3 100644 --- a/monarch_rdma/src/rdma_manager_actor.rs +++ b/monarch_rdma/src/rdma_manager_actor.rs @@ -366,13 +366,13 @@ impl RdmaManagerActor { ) -> Result<(RdmaMemoryRegionView, String), anyhow::Error> { unsafe { let mut mem_type: i32 = 0; - let ptr = addr as cuda_sys::CUdeviceptr; - let err = cuda_sys::cuPointerGetAttribute( + let ptr = addr as rdmaxcel_sys::CUdeviceptr; + let err = rdmaxcel_sys::rdmaxcel_cuPointerGetAttribute( &mut mem_type as *mut _ as *mut std::ffi::c_void, - cuda_sys::CUpointer_attribute_enum::CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + rdmaxcel_sys::CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr, ); 
- let is_cuda = err == cuda_sys::CUresult::CUDA_SUCCESS; + let is_cuda = err == rdmaxcel_sys::CUDA_SUCCESS; let mut selected_rdma_device = None; @@ -457,11 +457,11 @@ impl RdmaManagerActor { mrv = maybe_mrv.unwrap(); } else if is_cuda { let mut fd: i32 = -1; - cuda_sys::cuMemGetHandleForAddressRange( - &mut fd as *mut i32 as *mut std::ffi::c_void, - addr as cuda_sys::CUdeviceptr, + rdmaxcel_sys::rdmaxcel_cuMemGetHandleForAddressRange( + &mut fd, + addr as rdmaxcel_sys::CUdeviceptr, size, - cuda_sys::CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, + rdmaxcel_sys::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0, ); mr = rdmaxcel_sys::ibv_reg_dmabuf_mr(domain_pd, 0, size, 0, fd, access.0 as i32); diff --git a/monarch_rdma/src/test_utils.rs b/monarch_rdma/src/test_utils.rs index 113ad9a60..b1407761e 100644 --- a/monarch_rdma/src/test_utils.rs +++ b/monarch_rdma/src/test_utils.rs @@ -46,25 +46,25 @@ pub fn is_cuda_available() -> bool { fn check_cuda_available() -> bool { unsafe { // Try to initialize CUDA - let result = cuda_sys::cuInit(0); + let result = rdmaxcel_sys::rdmaxcel_cuInit(0); - if result != cuda_sys::CUresult::CUDA_SUCCESS { + if result != rdmaxcel_sys::CUDA_SUCCESS { return false; } // Check if there are any CUDA devices let mut device_count: i32 = 0; - let count_result = cuda_sys::cuDeviceGetCount(&mut device_count); + let count_result = rdmaxcel_sys::rdmaxcel_cuDeviceGetCount(&mut device_count); - if count_result != cuda_sys::CUresult::CUDA_SUCCESS || device_count <= 0 { + if count_result != rdmaxcel_sys::CUDA_SUCCESS || device_count <= 0 { return false; } // Try to get the first device to verify it's actually accessible - let mut device: cuda_sys::CUdevice = std::mem::zeroed(); - let device_result = cuda_sys::cuDeviceGet(&mut device, 0); + let mut device: rdmaxcel_sys::CUdevice = std::mem::zeroed(); + let device_result = rdmaxcel_sys::rdmaxcel_cuDeviceGet(&mut device, 0); - if device_result != cuda_sys::CUresult::CUDA_SUCCESS { + if device_result != 
rdmaxcel_sys::CUDA_SUCCESS { return false; } @@ -270,8 +270,8 @@ pub mod test_utils { pub actor_2: ActorRef, pub rdma_handle_1: RdmaBuffer, pub rdma_handle_2: RdmaBuffer, - cuda_context_1: Option, - cuda_context_2: Option, + cuda_context_1: Option, + cuda_context_2: Option, } #[derive(Debug, Clone)] @@ -375,46 +375,46 @@ pub mod test_utils { } // CUDA case unsafe { - cu_check!(cuda_sys::cuInit(0)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuInit(0)); - let mut dptr: cuda_sys::CUdeviceptr = std::mem::zeroed(); - let mut handle: cuda_sys::CUmemGenericAllocationHandle = std::mem::zeroed(); + let mut dptr: rdmaxcel_sys::CUdeviceptr = std::mem::zeroed(); + let mut handle: rdmaxcel_sys::CUmemGenericAllocationHandle = std::mem::zeroed(); - let mut device: cuda_sys::CUdevice = std::mem::zeroed(); - cu_check!(cuda_sys::cuDeviceGet(&mut device, accel.1 as i32)); + let mut device: rdmaxcel_sys::CUdevice = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet(&mut device, accel.1 as i32)); - let mut context: cuda_sys::CUcontext = std::mem::zeroed(); - cu_check!(cuda_sys::cuCtxCreate_v2(&mut context, 0, accel.1 as i32)); - cu_check!(cuda_sys::cuCtxSetCurrent(context)); + let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2(&mut context, 0, accel.1 as i32)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); let mut granularity: usize = 0; - let mut prop: cuda_sys::CUmemAllocationProp = std::mem::zeroed(); - prop.type_ = cuda_sys::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type_ = cuda_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE; + let mut prop: rdmaxcel_sys::CUmemAllocationProp = std::mem::zeroed(); + prop.type_ = rdmaxcel_sys::CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = device; prop.allocFlags.gpuDirectRDMACapable = 1; prop.requestedHandleTypes = - 
cuda_sys::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + rdmaxcel_sys::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - cu_check!(cuda_sys::cuMemGetAllocationGranularity( + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemGetAllocationGranularity( &mut granularity as *mut usize, &prop, - cuda_sys::CUmemAllocationGranularity_flags::CU_MEM_ALLOC_GRANULARITY_MINIMUM, + rdmaxcel_sys::CU_MEM_ALLOC_GRANULARITY_MINIMUM, )); // ensure our size is aligned let /*mut*/ padded_size: usize = ((buffer_size - 1) / granularity + 1) * granularity; assert!(padded_size == buffer_size); - cu_check!(cuda_sys::cuMemCreate( - &mut handle as *mut cuda_sys::CUmemGenericAllocationHandle, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemCreate( + &mut handle as *mut rdmaxcel_sys::CUmemGenericAllocationHandle, padded_size, &prop, 0 )); // reserve and map the memory - cu_check!(cuda_sys::cuMemAddressReserve( - &mut dptr as *mut cuda_sys::CUdeviceptr, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemAddressReserve( + &mut dptr as *mut rdmaxcel_sys::CUdeviceptr, padded_size, 0, 0, @@ -424,25 +424,25 @@ pub mod test_utils { assert!(padded_size.is_multiple_of(granularity)); // fails if a add cu_check macro; but passes if we don't - let err = cuda_sys::cuMemMap( - dptr as cuda_sys::CUdeviceptr, + let err = rdmaxcel_sys::rdmaxcel_cuMemMap( + dptr as rdmaxcel_sys::CUdeviceptr, padded_size, 0, - handle as cuda_sys::CUmemGenericAllocationHandle, + handle as rdmaxcel_sys::CUmemGenericAllocationHandle, 0, ); - if err != cuda_sys::CUresult::CUDA_SUCCESS { + if err != rdmaxcel_sys::CUDA_SUCCESS { panic!("failed reserving and mapping memory {:?}", err); } // set access - let mut access_desc: cuda_sys::CUmemAccessDesc = std::mem::zeroed(); + let mut access_desc: rdmaxcel_sys::CUmemAccessDesc = std::mem::zeroed(); access_desc.location.type_ = - cuda_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE; + rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; access_desc.location.id = device; access_desc.flags = - 
cuda_sys::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - cu_check!(cuda_sys::cuMemSetAccess(dptr, padded_size, &access_desc, 1)); + rdmaxcel_sys::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess(dptr, padded_size, &access_desc, 1)); buf_vec.push(Buffer { ptr: dptr, len: padded_size, @@ -460,11 +460,11 @@ pub mod test_utils { } unsafe { // Use the CUDA context that was created for the first buffer - cu_check!(cuda_sys::cuCtxSetCurrent( + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent( cuda_contexts[0].expect("No CUDA context found") )); - cu_check!(cuda_sys::cuMemcpyHtoD_v2( + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemcpyHtoD_v2( buf_vec[0].ptr, temp_buffer.as_ptr() as *const std::ffi::c_void, temp_buffer.len() @@ -514,30 +514,30 @@ pub mod test_utils { .await?; if self.cuda_context_1.is_some() { unsafe { - cu_check!(cuda_sys::cuCtxSetCurrent( + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent( self.cuda_context_1.expect("No CUDA context found") )); - cu_check!(cuda_sys::cuMemUnmap( - self.buffer_1.ptr as cuda_sys::CUdeviceptr, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemUnmap( + self.buffer_1.ptr as rdmaxcel_sys::CUdeviceptr, self.buffer_1.len )); - cu_check!(cuda_sys::cuMemAddressFree( - self.buffer_1.ptr as cuda_sys::CUdeviceptr, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemAddressFree( + self.buffer_1.ptr as rdmaxcel_sys::CUdeviceptr, self.buffer_1.len )); } } if self.cuda_context_2.is_some() { unsafe { - cu_check!(cuda_sys::cuCtxSetCurrent( + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent( self.cuda_context_2.expect("No CUDA context found") )); - cu_check!(cuda_sys::cuMemUnmap( - self.buffer_2.ptr as cuda_sys::CUdeviceptr, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemUnmap( + self.buffer_2.ptr as rdmaxcel_sys::CUdeviceptr, self.buffer_2.len )); - cu_check!(cuda_sys::cuMemAddressFree( - self.buffer_2.ptr as cuda_sys::CUdeviceptr, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemAddressFree( + self.buffer_2.ptr as 
rdmaxcel_sys::CUdeviceptr, self.buffer_2.len )); } @@ -579,12 +579,12 @@ pub mod test_utils { let mut temp_buffer = vec![0u8; size].into_boxed_slice(); // SAFETY: The buffer is allocated with the correct size and the pointer is valid. unsafe { - cu_check!(cuda_sys::cuCtxSetCurrent( + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent( cuda_context.expect("No CUDA context found") )); - cu_check!(cuda_sys::cuMemcpyDtoH_v2( + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemcpyDtoH_v2( temp_buffer.as_mut_ptr() as *mut std::ffi::c_void, - virtual_addr as cuda_sys::CUdeviceptr, + virtual_addr as rdmaxcel_sys::CUdeviceptr, size )); } diff --git a/rdmaxcel-sys/build.rs b/rdmaxcel-sys/build.rs index 8e2361431..9a428b69d 100644 --- a/rdmaxcel-sys/build.rs +++ b/rdmaxcel-sys/build.rs @@ -21,10 +21,15 @@ fn main() { // Link against the mlx5 library println!("cargo:rustc-link-lib=mlx5"); + // Link against dl for dynamic loading + println!("cargo:rustc-link-lib=dl"); + // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=src/rdmaxcel.h"); println!("cargo:rerun-if-changed=src/rdmaxcel.c"); println!("cargo:rerun-if-changed=src/rdmaxcel.cpp"); + println!("cargo:rerun-if-changed=src/driver_api.h"); + println!("cargo:rerun-if-changed=src/driver_api.cpp"); // Validate CUDA installation and get CUDA home path let cuda_home = match build_utils::validate_cuda_installation() { @@ -88,6 +93,10 @@ fn main() { .allowlist_function("pt_cuda_allocator_compatibility") .allowlist_function("register_segments") .allowlist_function("deregister_segments") + .allowlist_function("rdmaxcel_cu.*") + .allowlist_function("get_cuda_pci_address_from_ptr") + .allowlist_function("rdmaxcel_print_device_info") + .allowlist_function("rdmaxcel_error_string") .allowlist_type("ibv_.*") .allowlist_type("mlx5dv_.*") .allowlist_type("mlx5_wqe_.*") @@ -149,7 +158,8 @@ fn main() { } }; println!("cargo:rustc-link-search=native={}", cuda_lib_dir); - 
println!("cargo:rustc-link-lib=cuda"); + // Note: libcuda is now loaded dynamically via dlopen in driver_api.cpp + // Only link cudart (CUDA Runtime API) println!("cargo:rustc-link-lib=cudart"); // Link PyTorch C++ libraries for c10 symbols @@ -213,7 +223,8 @@ fn main() { // Compile the C++ source file for CUDA allocator compatibility let cpp_source_path = format!("{}/src/rdmaxcel.cpp", manifest_dir); - if Path::new(&cpp_source_path).exists() { + let driver_api_cpp_path = format!("{}/src/driver_api.cpp", manifest_dir); + if Path::new(&cpp_source_path).exists() && Path::new(&driver_api_cpp_path).exists() { let mut libtorch_include_dirs: Vec = vec![]; // Use the same approach as torch-sys: Python discovery first, env vars as fallback @@ -249,6 +260,7 @@ fn main() { let mut cpp_build = cc::Build::new(); cpp_build .file(&cpp_source_path) + .file(&driver_api_cpp_path) .include(format!("{}/src", manifest_dir)) .flag("-fPIC") .cpp(true) @@ -270,7 +282,12 @@ fn main() { cpp_build.compile("rdmaxcel_cpp"); } else { - panic!("C++ source file not found at {}", cpp_source_path); + if !Path::new(&cpp_source_path).exists() { + panic!("C++ source file not found at {}", cpp_source_path); + } + if !Path::new(&driver_api_cpp_path).exists() { + panic!("Driver API C++ source file not found at {}", driver_api_cpp_path); + } } // Compile the CUDA source file let cuda_source_path = format!("{}/src/rdmaxcel.cu", manifest_dir); diff --git a/rdmaxcel-sys/src/driver_api.cpp b/rdmaxcel-sys/src/driver_api.cpp new file mode 100644 index 000000000..3c6033e7c --- /dev/null +++ b/rdmaxcel-sys/src/driver_api.cpp @@ -0,0 +1,204 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "driver_api.h" +#include +#include +#include + +// List of CUDA driver functions needed by rdmaxcel +#define RDMAXCEL_CUDA_DRIVER_API(_) \ + _(cuMemGetHandleForAddressRange) \ + _(cuMemGetAllocationGranularity) \ + _(cuMemCreate) \ + _(cuMemAddressReserve) \ + _(cuMemMap) \ + _(cuMemSetAccess) \ + _(cuMemUnmap) \ + _(cuMemAddressFree) \ + _(cuMemRelease) \ + _(cuMemcpyHtoD_v2) \ + _(cuMemcpyDtoH_v2) \ + _(cuPointerGetAttribute) \ + _(cuInit) \ + _(cuDeviceGet) \ + _(cuDeviceGetCount) \ + _(cuDeviceGetAttribute) \ + _(cuCtxCreate_v2) \ + _(cuCtxSetCurrent) \ + _(cuGetErrorString) + +namespace rdmaxcel { + +struct DriverAPI { +#define CREATE_MEMBER(name) decltype(&name) name##_; + RDMAXCEL_CUDA_DRIVER_API(CREATE_MEMBER) +#undef CREATE_MEMBER + static DriverAPI* get(); +}; + +namespace { + +DriverAPI create_driver_api() { + // Try to open libcuda.so.1 - RTLD_NOLOAD means only succeed if already loaded + void* handle = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD); + if (!handle) { + std::cerr << "[RdmaXcel] Warning: libcuda.so.1 not loaded, trying to load it now" << std::endl; + handle = dlopen("libcuda.so.1", RTLD_LAZY); + } + + if (!handle) { + throw std::runtime_error(std::string("[RdmaXcel] Can't open libcuda.so.1: ") + dlerror()); + } + + DriverAPI r{}; + +#define LOOKUP_CUDA_ENTRY(name) \ + r.name##_ = reinterpret_cast(dlsym(handle, #name)); \ + if (!r.name##_) { \ + throw std::runtime_error(std::string("[RdmaXcel] Can't find ") + #name + ": " + dlerror()); \ + } + + RDMAXCEL_CUDA_DRIVER_API(LOOKUP_CUDA_ENTRY) +#undef LOOKUP_CUDA_ENTRY + + return r; +} + +} // namespace + +DriverAPI* DriverAPI::get() { + static DriverAPI singleton = create_driver_api(); + return &singleton; +} + +} // namespace rdmaxcel + +// C API wrapper implementations +extern "C" { + +// Memory management +CUresult rdmaxcel_cuMemGetHandleForAddressRange( + int* handle, + CUdeviceptr dptr, + size_t size, + CUmemRangeHandleType handleType, + unsigned long long flags) { + 
return rdmaxcel::DriverAPI::get()->cuMemGetHandleForAddressRange_( + handle, dptr, size, handleType, flags); +} + +CUresult rdmaxcel_cuMemGetAllocationGranularity( + size_t* granularity, + const CUmemAllocationProp* prop, + CUmemAllocationGranularity_flags option) { + return rdmaxcel::DriverAPI::get()->cuMemGetAllocationGranularity_( + granularity, prop, option); +} + +CUresult rdmaxcel_cuMemCreate( + CUmemGenericAllocationHandle* handle, + size_t size, + const CUmemAllocationProp* prop, + unsigned long long flags) { + return rdmaxcel::DriverAPI::get()->cuMemCreate_(handle, size, prop, flags); +} + +CUresult rdmaxcel_cuMemAddressReserve( + CUdeviceptr* ptr, + size_t size, + size_t alignment, + CUdeviceptr addr, + unsigned long long flags) { + return rdmaxcel::DriverAPI::get()->cuMemAddressReserve_( + ptr, size, alignment, addr, flags); +} + +CUresult rdmaxcel_cuMemMap( + CUdeviceptr ptr, + size_t size, + size_t offset, + CUmemGenericAllocationHandle handle, + unsigned long long flags) { + return rdmaxcel::DriverAPI::get()->cuMemMap_(ptr, size, offset, handle, flags); +} + +CUresult rdmaxcel_cuMemSetAccess( + CUdeviceptr ptr, + size_t size, + const CUmemAccessDesc* desc, + size_t count) { + return rdmaxcel::DriverAPI::get()->cuMemSetAccess_(ptr, size, desc, count); +} + +CUresult rdmaxcel_cuMemUnmap(CUdeviceptr ptr, size_t size) { + return rdmaxcel::DriverAPI::get()->cuMemUnmap_(ptr, size); +} + +CUresult rdmaxcel_cuMemAddressFree(CUdeviceptr ptr, size_t size) { + return rdmaxcel::DriverAPI::get()->cuMemAddressFree_(ptr, size); +} + +CUresult rdmaxcel_cuMemRelease(CUmemGenericAllocationHandle handle) { + return rdmaxcel::DriverAPI::get()->cuMemRelease_(handle); +} + +CUresult rdmaxcel_cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount) { + return rdmaxcel::DriverAPI::get()->cuMemcpyHtoD_v2_(dstDevice, srcHost, ByteCount); +} + +CUresult rdmaxcel_cuMemcpyDtoH_v2(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) { + return 
rdmaxcel::DriverAPI::get()->cuMemcpyDtoH_v2_(dstHost, srcDevice, ByteCount); +} + +// Pointer queries +CUresult rdmaxcel_cuPointerGetAttribute( + void* data, + CUpointer_attribute attribute, + CUdeviceptr ptr) { + return rdmaxcel::DriverAPI::get()->cuPointerGetAttribute_( + data, attribute, ptr); +} + +// Device management +CUresult rdmaxcel_cuInit(unsigned int Flags) { + return rdmaxcel::DriverAPI::get()->cuInit_(Flags); +} + +CUresult rdmaxcel_cuDeviceGet(CUdevice* device, int ordinal) { + return rdmaxcel::DriverAPI::get()->cuDeviceGet_(device, ordinal); +} + +CUresult rdmaxcel_cuDeviceGetCount(int* count) { + return rdmaxcel::DriverAPI::get()->cuDeviceGetCount_(count); +} + +CUresult rdmaxcel_cuDeviceGetAttribute( + int* pi, + CUdevice_attribute attrib, + CUdevice dev) { + return rdmaxcel::DriverAPI::get()->cuDeviceGetAttribute_(pi, attrib, dev); +} + +// Context management +CUresult rdmaxcel_cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev) { + return rdmaxcel::DriverAPI::get()->cuCtxCreate_v2_(pctx, flags, dev); +} + +CUresult rdmaxcel_cuCtxSetCurrent(CUcontext ctx) { + return rdmaxcel::DriverAPI::get()->cuCtxSetCurrent_(ctx); +} + +// Error handling +CUresult rdmaxcel_cuGetErrorString(CUresult error, const char** pStr) { + return rdmaxcel::DriverAPI::get()->cuGetErrorString_(error, pStr); +} + +} // extern "C" + + diff --git a/rdmaxcel-sys/src/driver_api.h b/rdmaxcel-sys/src/driver_api.h new file mode 100644 index 000000000..49eeece20 --- /dev/null +++ b/rdmaxcel-sys/src/driver_api.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// C API wrapper functions for CUDA driver functions +// These are loaded dynamically via dlopen and exported to Rust via bindgen + +// Memory management +CUresult rdmaxcel_cuMemGetHandleForAddressRange( + int* handle, + CUdeviceptr dptr, + size_t size, + CUmemRangeHandleType handleType, + unsigned long long flags); + +CUresult rdmaxcel_cuMemGetAllocationGranularity( + size_t* granularity, + const CUmemAllocationProp* prop, + CUmemAllocationGranularity_flags option); + +CUresult rdmaxcel_cuMemCreate( + CUmemGenericAllocationHandle* handle, + size_t size, + const CUmemAllocationProp* prop, + unsigned long long flags); + +CUresult rdmaxcel_cuMemAddressReserve( + CUdeviceptr* ptr, + size_t size, + size_t alignment, + CUdeviceptr addr, + unsigned long long flags); + +CUresult rdmaxcel_cuMemMap( + CUdeviceptr ptr, + size_t size, + size_t offset, + CUmemGenericAllocationHandle handle, + unsigned long long flags); + +CUresult rdmaxcel_cuMemSetAccess( + CUdeviceptr ptr, + size_t size, + const CUmemAccessDesc* desc, + size_t count); + +CUresult rdmaxcel_cuMemUnmap(CUdeviceptr ptr, size_t size); + +CUresult rdmaxcel_cuMemAddressFree(CUdeviceptr ptr, size_t size); + +CUresult rdmaxcel_cuMemRelease(CUmemGenericAllocationHandle handle); + +CUresult rdmaxcel_cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); + +CUresult rdmaxcel_cuMemcpyDtoH_v2(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); + +// Pointer queries +CUresult rdmaxcel_cuPointerGetAttribute( + void* data, + CUpointer_attribute attribute, + CUdeviceptr ptr); + +// Device management +CUresult rdmaxcel_cuInit(unsigned int Flags); + +CUresult rdmaxcel_cuDeviceGet(CUdevice* device, int ordinal); + +CUresult rdmaxcel_cuDeviceGetCount(int* count); + +CUresult +rdmaxcel_cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev); + +// Context management +CUresult rdmaxcel_cuCtxCreate_v2(CUcontext* 
pctx, unsigned int flags, CUdevice dev); + +CUresult rdmaxcel_cuCtxSetCurrent(CUcontext ctx); + +// Error handling +CUresult rdmaxcel_cuGetErrorString(CUresult error, const char** pStr); + +#ifdef __cplusplus +} // extern "C" +#endif + diff --git a/rdmaxcel-sys/src/rdmaxcel.cpp b/rdmaxcel-sys/src/rdmaxcel.cpp index 45c40d665..0ce750716 100644 --- a/rdmaxcel-sys/src/rdmaxcel.cpp +++ b/rdmaxcel-sys/src/rdmaxcel.cpp @@ -7,6 +7,7 @@ */ #include "rdmaxcel.h" +#include "driver_api.h" #include #include #include @@ -235,7 +236,7 @@ int compact_mrs(struct ibv_pd* pd, SegmentInfo& seg, int access_flags) { // Get dmabuf handle for the entire segment int fd = -1; - CUresult cu_result = cuMemGetHandleForAddressRange( + CUresult cu_result = rdmaxcel_cuMemGetHandleForAddressRange( &fd, static_cast(start_addr), total_size, @@ -297,7 +298,7 @@ int register_segments(struct ibv_pd* pd, struct ibv_qp* qp) { } int fd = -1; - CUresult cu_result = cuMemGetHandleForAddressRange( + CUresult cu_result = rdmaxcel_cuMemGetHandleForAddressRange( &fd, static_cast(chunk_start), chunk_size, @@ -352,7 +353,7 @@ int get_cuda_pci_address_from_ptr( } int device_ordinal = -1; - CUresult err = cuPointerGetAttribute( + CUresult err = rdmaxcel_cuPointerGetAttribute( &device_ordinal, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, cuda_ptr); if (err != CUDA_SUCCESS) { @@ -360,7 +361,7 @@ int get_cuda_pci_address_from_ptr( } CUdevice device; - err = cuDeviceGet(&device, device_ordinal); + err = rdmaxcel_cuDeviceGet(&device, device_ordinal); if (err != CUDA_SUCCESS) { return RDMAXCEL_CUDA_GET_DEVICE_FAILED; } @@ -371,20 +372,20 @@ int get_cuda_pci_address_from_ptr( // Get PCI bus ID err = - cuDeviceGetAttribute(&pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device); + rdmaxcel_cuDeviceGetAttribute(&pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device); if (err != CUDA_SUCCESS) { return RDMAXCEL_CUDA_GET_ATTRIBUTE_FAILED; } // Get PCI device ID - err = cuDeviceGetAttribute( + err = rdmaxcel_cuDeviceGetAttribute( 
&pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, device); if (err != CUDA_SUCCESS) { return RDMAXCEL_CUDA_GET_ATTRIBUTE_FAILED; } // Get PCI domain ID - err = cuDeviceGetAttribute( + err = rdmaxcel_cuDeviceGetAttribute( &pci_domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, device); if (err != CUDA_SUCCESS) { return RDMAXCEL_CUDA_GET_ATTRIBUTE_FAILED; diff --git a/rdmaxcel-sys/src/rdmaxcel.h b/rdmaxcel-sys/src/rdmaxcel.h index b0ade31db..01b01d575 100644 --- a/rdmaxcel-sys/src/rdmaxcel.h +++ b/rdmaxcel-sys/src/rdmaxcel.h @@ -13,6 +13,7 @@ #include #include #include +#include "driver_api.h" #ifdef __cplusplus extern "C" { From 412b1ec359258f6791d1b9d3fda60e291a8530c8 Mon Sep 17 00:00:00 2001 From: zdevito Date: Tue, 18 Nov 2025 15:58:27 -0800 Subject: [PATCH 2/5] Update on "Remove direct dependency on libcuda" Dynamically load any cuda driver functions so that monarch library can be loaded a machine that doesn't have a gpu even if the library is built with one. Differential Revision: [D87380631](https://our.internmc.facebook.com/intern/diff/D87380631/) [ghstack-poisoned] --- cuda-sys/src/lib.rs | 1 - monarch_rdma/src/test_utils.rs | 24 +++++++--- rdmaxcel-sys/build.rs | 5 ++- rdmaxcel-sys/src/driver_api.cpp | 80 +++++++++++++++++++-------------- rdmaxcel-sys/src/driver_api.h | 14 ++++-- rdmaxcel-sys/src/rdmaxcel.cpp | 6 +-- 6 files changed, 80 insertions(+), 50 deletions(-) diff --git a/cuda-sys/src/lib.rs b/cuda-sys/src/lib.rs index 64ed70c64..a99325278 100644 --- a/cuda-sys/src/lib.rs +++ b/cuda-sys/src/lib.rs @@ -34,4 +34,3 @@ mod inner { } pub use inner::*; - diff --git a/monarch_rdma/src/test_utils.rs b/monarch_rdma/src/test_utils.rs index b1407761e..bd187691d 100644 --- a/monarch_rdma/src/test_utils.rs +++ b/monarch_rdma/src/test_utils.rs @@ -381,10 +381,17 @@ pub mod test_utils { let mut handle: rdmaxcel_sys::CUmemGenericAllocationHandle = std::mem::zeroed(); let mut device: rdmaxcel_sys::CUdevice = std::mem::zeroed(); - 
cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet(&mut device, accel.1 as i32)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet( + &mut device, + accel.1 as i32 + )); let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); - cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2(&mut context, 0, accel.1 as i32)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2( + &mut context, + 0, + accel.1 as i32 + )); cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); let mut granularity: usize = 0; @@ -437,12 +444,15 @@ pub mod test_utils { // set access let mut access_desc: rdmaxcel_sys::CUmemAccessDesc = std::mem::zeroed(); - access_desc.location.type_ = - rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; + access_desc.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; access_desc.location.id = device; - access_desc.flags = - rdmaxcel_sys::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess(dptr, padded_size, &access_desc, 1)); + access_desc.flags = rdmaxcel_sys::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess( + dptr, + padded_size, + &access_desc, + 1 + )); buf_vec.push(Buffer { ptr: dptr, len: padded_size, diff --git a/rdmaxcel-sys/build.rs b/rdmaxcel-sys/build.rs index 9a428b69d..036122a66 100644 --- a/rdmaxcel-sys/build.rs +++ b/rdmaxcel-sys/build.rs @@ -286,7 +286,10 @@ fn main() { panic!("C++ source file not found at {}", cpp_source_path); } if !Path::new(&driver_api_cpp_path).exists() { - panic!("Driver API C++ source file not found at {}", driver_api_cpp_path); + panic!( + "Driver API C++ source file not found at {}", + driver_api_cpp_path + ); } } // Compile the CUDA source file diff --git a/rdmaxcel-sys/src/driver_api.cpp b/rdmaxcel-sys/src/driver_api.cpp index 3c6033e7c..cde8efc44 100644 --- a/rdmaxcel-sys/src/driver_api.cpp +++ b/rdmaxcel-sys/src/driver_api.cpp @@ -8,29 +8,29 @@ #include "driver_api.h" #include -#include #include +#include // List of CUDA driver functions needed by 
rdmaxcel -#define RDMAXCEL_CUDA_DRIVER_API(_) \ - _(cuMemGetHandleForAddressRange) \ - _(cuMemGetAllocationGranularity) \ - _(cuMemCreate) \ - _(cuMemAddressReserve) \ - _(cuMemMap) \ - _(cuMemSetAccess) \ - _(cuMemUnmap) \ - _(cuMemAddressFree) \ - _(cuMemRelease) \ - _(cuMemcpyHtoD_v2) \ - _(cuMemcpyDtoH_v2) \ - _(cuPointerGetAttribute) \ - _(cuInit) \ - _(cuDeviceGet) \ - _(cuDeviceGetCount) \ - _(cuDeviceGetAttribute) \ - _(cuCtxCreate_v2) \ - _(cuCtxSetCurrent) \ +#define RDMAXCEL_CUDA_DRIVER_API(_) \ + _(cuMemGetHandleForAddressRange) \ + _(cuMemGetAllocationGranularity) \ + _(cuMemCreate) \ + _(cuMemAddressReserve) \ + _(cuMemMap) \ + _(cuMemSetAccess) \ + _(cuMemUnmap) \ + _(cuMemAddressFree) \ + _(cuMemRelease) \ + _(cuMemcpyHtoD_v2) \ + _(cuMemcpyDtoH_v2) \ + _(cuPointerGetAttribute) \ + _(cuInit) \ + _(cuDeviceGet) \ + _(cuDeviceGetCount) \ + _(cuDeviceGetAttribute) \ + _(cuCtxCreate_v2) \ + _(cuCtxSetCurrent) \ _(cuGetErrorString) namespace rdmaxcel { @@ -48,20 +48,24 @@ DriverAPI create_driver_api() { // Try to open libcuda.so.1 - RTLD_NOLOAD means only succeed if already loaded void* handle = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD); if (!handle) { - std::cerr << "[RdmaXcel] Warning: libcuda.so.1 not loaded, trying to load it now" << std::endl; + std::cerr + << "[RdmaXcel] Warning: libcuda.so.1 not loaded, trying to load it now" + << std::endl; handle = dlopen("libcuda.so.1", RTLD_LAZY); } if (!handle) { - throw std::runtime_error(std::string("[RdmaXcel] Can't open libcuda.so.1: ") + dlerror()); + throw std::runtime_error( + std::string("[RdmaXcel] Can't open libcuda.so.1: ") + dlerror()); } DriverAPI r{}; -#define LOOKUP_CUDA_ENTRY(name) \ - r.name##_ = reinterpret_cast(dlsym(handle, #name)); \ - if (!r.name##_) { \ - throw std::runtime_error(std::string("[RdmaXcel] Can't find ") + #name + ": " + dlerror()); \ +#define LOOKUP_CUDA_ENTRY(name) \ + r.name##_ = reinterpret_cast(dlsym(handle, #name)); \ + if (!r.name##_) { \ + throw 
std::runtime_error( \ + std::string("[RdmaXcel] Can't find ") + #name + ": " + dlerror()); \ } RDMAXCEL_CUDA_DRIVER_API(LOOKUP_CUDA_ENTRY) @@ -125,7 +129,8 @@ CUresult rdmaxcel_cuMemMap( size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags) { - return rdmaxcel::DriverAPI::get()->cuMemMap_(ptr, size, offset, handle, flags); + return rdmaxcel::DriverAPI::get()->cuMemMap_( + ptr, size, offset, handle, flags); } CUresult rdmaxcel_cuMemSetAccess( @@ -148,12 +153,20 @@ CUresult rdmaxcel_cuMemRelease(CUmemGenericAllocationHandle handle) { return rdmaxcel::DriverAPI::get()->cuMemRelease_(handle); } -CUresult rdmaxcel_cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount) { - return rdmaxcel::DriverAPI::get()->cuMemcpyHtoD_v2_(dstDevice, srcHost, ByteCount); +CUresult rdmaxcel_cuMemcpyHtoD_v2( + CUdeviceptr dstDevice, + const void* srcHost, + size_t ByteCount) { + return rdmaxcel::DriverAPI::get()->cuMemcpyHtoD_v2_( + dstDevice, srcHost, ByteCount); } -CUresult rdmaxcel_cuMemcpyDtoH_v2(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) { - return rdmaxcel::DriverAPI::get()->cuMemcpyDtoH_v2_(dstHost, srcDevice, ByteCount); +CUresult rdmaxcel_cuMemcpyDtoH_v2( + void* dstHost, + CUdeviceptr srcDevice, + size_t ByteCount) { + return rdmaxcel::DriverAPI::get()->cuMemcpyDtoH_v2_( + dstHost, srcDevice, ByteCount); } // Pointer queries @@ -186,7 +199,8 @@ CUresult rdmaxcel_cuDeviceGetAttribute( } // Context management -CUresult rdmaxcel_cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev) { +CUresult +rdmaxcel_cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev) { return rdmaxcel::DriverAPI::get()->cuCtxCreate_v2_(pctx, flags, dev); } @@ -200,5 +214,3 @@ CUresult rdmaxcel_cuGetErrorString(CUresult error, const char** pStr) { } } // extern "C" - - diff --git a/rdmaxcel-sys/src/driver_api.h b/rdmaxcel-sys/src/driver_api.h index 49eeece20..2e570df57 100644 --- a/rdmaxcel-sys/src/driver_api.h +++ 
b/rdmaxcel-sys/src/driver_api.h @@ -61,9 +61,15 @@ CUresult rdmaxcel_cuMemAddressFree(CUdeviceptr ptr, size_t size); CUresult rdmaxcel_cuMemRelease(CUmemGenericAllocationHandle handle); -CUresult rdmaxcel_cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); +CUresult rdmaxcel_cuMemcpyHtoD_v2( + CUdeviceptr dstDevice, + const void* srcHost, + size_t ByteCount); -CUresult rdmaxcel_cuMemcpyDtoH_v2(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); +CUresult rdmaxcel_cuMemcpyDtoH_v2( + void* dstHost, + CUdeviceptr srcDevice, + size_t ByteCount); // Pointer queries CUresult rdmaxcel_cuPointerGetAttribute( @@ -82,7 +88,8 @@ CUresult rdmaxcel_cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev); // Context management -CUresult rdmaxcel_cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev); +CUresult +rdmaxcel_cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev); CUresult rdmaxcel_cuCtxSetCurrent(CUcontext ctx); @@ -92,4 +99,3 @@ CUresult rdmaxcel_cuGetErrorString(CUresult error, const char** pStr); #ifdef __cplusplus } // extern "C" #endif - diff --git a/rdmaxcel-sys/src/rdmaxcel.cpp b/rdmaxcel-sys/src/rdmaxcel.cpp index 0ce750716..9e359895b 100644 --- a/rdmaxcel-sys/src/rdmaxcel.cpp +++ b/rdmaxcel-sys/src/rdmaxcel.cpp @@ -7,7 +7,6 @@ */ #include "rdmaxcel.h" -#include "driver_api.h" #include #include #include @@ -15,6 +14,7 @@ #include #include #include +#include "driver_api.h" // MR size must be a multiple of 2MB const size_t MR_ALIGNMENT = 2ULL * 1024 * 1024; @@ -371,8 +371,8 @@ int get_cuda_pci_address_from_ptr( int pci_domain_id = -1; // Get PCI bus ID - err = - rdmaxcel_cuDeviceGetAttribute(&pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device); + err = rdmaxcel_cuDeviceGetAttribute( + &pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device); if (err != CUDA_SUCCESS) { return RDMAXCEL_CUDA_GET_ATTRIBUTE_FAILED; } From e0a3137b0728bff4ed363ef4d8f2977a814da696 Mon Sep 17 00:00:00 2001 From: zdevito 
Date: Wed, 19 Nov 2025 09:58:03 -0800 Subject: [PATCH 3/5] Update on "Remove direct dependency on libcuda" Dynamically load any cuda driver functions so that monarch library can be loaded on a machine that doesn't have a gpu even if the library is built with one. Differential Revision: [D87380631](https://our.internmc.facebook.com/intern/diff/D87380631/) [ghstack-poisoned] From 67ddce51575e2ff8dbee2847bb15cfbc99568b24 Mon Sep 17 00:00:00 2001 From: zdevito Date: Wed, 19 Nov 2025 13:33:10 -0800 Subject: [PATCH 4/5] Update on "Remove direct dependency on libcuda" Dynamically load any cuda driver functions so that monarch library can be loaded on a machine that doesn't have a gpu even if the library is built with one. Differential Revision: [D87380631](https://our.internmc.facebook.com/intern/diff/D87380631/) [ghstack-poisoned] --- .../cuda_ping_pong/src/cuda_ping_pong.rs | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs b/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs index e4e283807..28c5a4bb2 100644 --- a/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs +++ b/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs @@ -269,45 +269,45 @@ impl Actor for CudaRdmaActor { // For this example, we'll use a regular Rust allocation as a placeholder // The actual CUDA allocation would be handled by the monarch_rdma library unsafe { - cu_check!(cuda_sys::cuInit(0)); - let mut dptr: cuda_sys::CUdeviceptr = std::mem::zeroed(); - let mut handle: cuda_sys::CUmemGenericAllocationHandle = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuInit(0)); + let mut dptr: rdmaxcel_sys::CUdeviceptr = std::mem::zeroed(); + let mut handle: rdmaxcel_sys::CUmemGenericAllocationHandle = std::mem::zeroed(); - let mut device: cuda_sys::CUdevice = std::mem::zeroed(); - cu_check!(cuda_sys::cuDeviceGet(&mut device, device_id as i32)); + let mut device: rdmaxcel_sys::CUdevice = 
std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet(&mut device, device_id as i32)); - let mut context: cuda_sys::CUcontext = std::mem::zeroed(); - cu_check!(cuda_sys::cuCtxCreate_v2(&mut context, 0, device_id as i32)); - cu_check!(cuda_sys::cuCtxSetCurrent(context)); + let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2(&mut context, 0, device_id as i32)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); let mut granularity: usize = 0; - let mut prop: cuda_sys::CUmemAllocationProp = std::mem::zeroed(); - prop.type_ = cuda_sys::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type_ = cuda_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE; + let mut prop: rdmaxcel_sys::CUmemAllocationProp = std::mem::zeroed(); + prop.type_ = rdmaxcel_sys::CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = device; prop.allocFlags.gpuDirectRDMACapable = 1; prop.requestedHandleTypes = - cuda_sys::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + rdmaxcel_sys::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - cu_check!(cuda_sys::cuMemGetAllocationGranularity( + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemGetAllocationGranularity( &mut granularity as *mut usize, &prop, - cuda_sys::CUmemAllocationGranularity_flags::CU_MEM_ALLOC_GRANULARITY_MINIMUM, + rdmaxcel_sys::CU_MEM_ALLOC_GRANULARITY_MINIMUM, )); // ensure our size is aligned let padded_size: usize = ((buffer_size - 1) / granularity + 1) * granularity; assert!(padded_size == buffer_size); - cu_check!(cuda_sys::cuMemCreate( - &mut handle as *mut cuda_sys::CUmemGenericAllocationHandle, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemCreate( + &mut handle as *mut rdmaxcel_sys::CUmemGenericAllocationHandle, padded_size, &prop, 0 )); // reserve and map the memory - cu_check!(cuda_sys::cuMemAddressReserve( - &mut dptr as *mut cuda_sys::CUdeviceptr, + 
cu_check!(rdmaxcel_sys::rdmaxcel_cuMemAddressReserve( + &mut dptr as *mut rdmaxcel_sys::CUdeviceptr, padded_size, 0, 0, @@ -317,23 +317,23 @@ impl Actor for CudaRdmaActor { assert!(padded_size % granularity == 0); // fails if a add cu_check macro; but passes if we don't - let err = cuda_sys::cuMemMap( - dptr as cuda_sys::CUdeviceptr, + let err = rdmaxcel_sys::rdmaxcel_cuMemMap( + dptr as rdmaxcel_sys::CUdeviceptr, padded_size, 0, - handle as cuda_sys::CUmemGenericAllocationHandle, + handle as rdmaxcel_sys::CUmemGenericAllocationHandle, 0, ); - if err != cuda_sys::CUresult::CUDA_SUCCESS { + if err != rdmaxcel_sys::CUDA_SUCCESS { panic!("failed reserving and mapping memory {:?}", err); } // set access - let mut access_desc: cuda_sys::CUmemAccessDesc = std::mem::zeroed(); - access_desc.location.type_ = cuda_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE; + let mut access_desc: rdmaxcel_sys::CUmemAccessDesc = std::mem::zeroed(); + access_desc.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; access_desc.location.id = device; - access_desc.flags = cuda_sys::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - cu_check!(cuda_sys::cuMemSetAccess(dptr, padded_size, &access_desc, 1)); + access_desc.flags = rdmaxcel_sys::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess(dptr, padded_size, &access_desc, 1)); Ok(Self { device_id, cpu_buffer, @@ -385,15 +385,15 @@ impl Handler for CudaRdmaActor { self.cpu_buffer.fill(value); unsafe { - let mut context: cuda_sys::CUcontext = std::mem::zeroed(); - cu_check!(cuda_sys::cuCtxCreate_v2( + let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2( &mut context, 0, self.device_id as i32 )); - cu_check!(cuda_sys::cuCtxSetCurrent(context)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); cuda_sys::cudaDeviceSynchronize(); - cu_check!(cuda_sys::cuMemcpyHtoD_v2( + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemcpyHtoD_v2( 
self.cu_ptr as u64, self.cpu_buffer.as_ptr() as *const std::ffi::c_void, self.cpu_buffer.len() @@ -459,13 +459,13 @@ impl Handler for CudaRdmaActor { validate_execution_context().await?; unsafe { - let mut context: cuda_sys::CUcontext = std::mem::zeroed(); - cu_check!(cuda_sys::cuCtxCreate_v2( + let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2( &mut context, 0, self.device_id as i32 )); - cu_check!(cuda_sys::cuCtxSetCurrent(context)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); } let qp = self .rdma_manager @@ -532,17 +532,17 @@ impl Handler for CudaRdmaActor { VerifyBuffer(expected_values, reply): VerifyBuffer, ) -> Result<(), anyhow::Error> { unsafe { - let mut context: cuda_sys::CUcontext = std::mem::zeroed(); - cu_check!(cuda_sys::cuCtxCreate_v2( + let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2( &mut context, 0, self.device_id as i32 )); - cu_check!(cuda_sys::cuCtxSetCurrent(context)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); cuda_sys::cudaDeviceSynchronize(); - cu_check!(cuda_sys::cuMemcpyDtoH_v2( + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemcpyDtoH_v2( self.cpu_buffer.as_mut_ptr() as *mut std::ffi::c_void, - self.cu_ptr as cuda_sys::CUdeviceptr, + self.cu_ptr as rdmaxcel_sys::CUdeviceptr, self.cpu_buffer.len(), )); } From eb95a8452a0415d0553a41055df0a9c9a439d298 Mon Sep 17 00:00:00 2001 From: zdevito Date: Wed, 19 Nov 2025 13:56:29 -0800 Subject: [PATCH 5/5] Update on "Remove direct dependency on libcuda" Dynamically load any cuda driver functions so that monarch library can be loaded on a machine that doesn't have a gpu even if the library is built with one.
Differential Revision: [D87380631](https://our.internmc.facebook.com/intern/diff/D87380631/) [ghstack-poisoned] --- .../cuda_ping_pong/src/cuda_ping_pong.rs | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs b/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs index 28c5a4bb2..27e41d79d 100644 --- a/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs +++ b/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs @@ -274,10 +274,17 @@ impl Actor for CudaRdmaActor { let mut handle: rdmaxcel_sys::CUmemGenericAllocationHandle = std::mem::zeroed(); let mut device: rdmaxcel_sys::CUdevice = std::mem::zeroed(); - cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet(&mut device, device_id as i32)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet( + &mut device, + device_id as i32 + )); let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); - cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2(&mut context, 0, device_id as i32)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2( + &mut context, + 0, + device_id as i32 + )); cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); let mut granularity: usize = 0; @@ -286,8 +293,7 @@ impl Actor for CudaRdmaActor { prop.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = device; prop.allocFlags.gpuDirectRDMACapable = 1; - prop.requestedHandleTypes = - rdmaxcel_sys::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + prop.requestedHandleTypes = rdmaxcel_sys::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; cu_check!(rdmaxcel_sys::rdmaxcel_cuMemGetAllocationGranularity( &mut granularity as *mut usize, @@ -333,7 +339,12 @@ impl Actor for CudaRdmaActor { access_desc.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; access_desc.location.id = device; access_desc.flags = rdmaxcel_sys::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess(dptr, padded_size, &access_desc, 
1)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess( + dptr, + padded_size, + &access_desc, + 1 + )); Ok(Self { device_id, cpu_buffer,