From 6d1221e0f0bd2e8054efa23c4cd8bc5e19ab2894 Mon Sep 17 00:00:00 2001 From: zdevito Date: Tue, 18 Nov 2025 14:55:16 -0800 Subject: [PATCH 1/5] Remove direct dependency on libcuda Dynamically load any cuda driver functions so that monarch library can be loaded a machine that doesn't have a gpu even if the library is built with one. Differential Revision: [D87380631](https://our.internmc.facebook.com/intern/diff/D87380631/) [ghstack-poisoned] --- cuda-sys/build.rs | 11 +- cuda-sys/src/lib.rs | 16 -- cuda-sys/src/wrapper.h | 1 - monarch_rdma/src/macros.rs | 4 +- monarch_rdma/src/rdma_manager_actor.rs | 16 +- monarch_rdma/src/test_utils.rs | 100 ++++++------ rdmaxcel-sys/build.rs | 23 ++- rdmaxcel-sys/src/driver_api.cpp | 204 +++++++++++++++++++++++++ rdmaxcel-sys/src/driver_api.h | 95 ++++++++++++ rdmaxcel-sys/src/rdmaxcel.cpp | 15 +- rdmaxcel-sys/src/rdmaxcel.h | 1 + 11 files changed, 393 insertions(+), 93 deletions(-) create mode 100644 rdmaxcel-sys/src/driver_api.cpp create mode 100644 rdmaxcel-sys/src/driver_api.h diff --git a/cuda-sys/build.rs b/cuda-sys/build.rs index 663a4197b..22b8b4675 100644 --- a/cuda-sys/build.rs +++ b/cuda-sys/build.rs @@ -31,11 +31,11 @@ fn main() { .clang_arg("c++") .clang_arg("-std=gnu++20") .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) - // Allow the specified functions and types - .allowlist_function("cu.*") - .allowlist_function("CU.*") - .allowlist_type("cu.*") - .allowlist_type("CU.*") + // Allow the specified functions and types (CUDA Runtime API only) + .allowlist_function("cuda.*") + .allowlist_function("CUDA.*") + .allowlist_type("cuda.*") + .allowlist_type("CUDA.*") // Use newtype enum style .default_enum_style(bindgen::EnumVariation::NewType { is_bitfield: false, @@ -78,7 +78,6 @@ fn main() { } }; println!("cargo:rustc-link-search=native={}", cuda_lib_dir); - println!("cargo:rustc-link-lib=cuda"); println!("cargo:rustc-link-lib=cudart"); // Generate bindings - fail fast if this doesn't work diff --git 
a/cuda-sys/src/lib.rs b/cuda-sys/src/lib.rs index 2fc1d05ba..64ed70c64 100644 --- a/cuda-sys/src/lib.rs +++ b/cuda-sys/src/lib.rs @@ -35,19 +35,3 @@ mod inner { pub use inner::*; -#[cfg(test)] -mod tests { - use std::mem::MaybeUninit; - - use super::*; - - #[test] - fn sanity() { - // SAFETY: testing bindings - unsafe { - let mut version = MaybeUninit::::uninit(); - let result = cuDriverGetVersion(version.as_mut_ptr()); - assert_eq!(result, cudaError_enum(0)); - } - } -} diff --git a/cuda-sys/src/wrapper.h b/cuda-sys/src/wrapper.h index 04b61ecf7..02b4028d6 100644 --- a/cuda-sys/src/wrapper.h +++ b/cuda-sys/src/wrapper.h @@ -8,5 +8,4 @@ #pragma once -#include #include diff --git a/monarch_rdma/src/macros.rs b/monarch_rdma/src/macros.rs index 785d13924..097ac9b60 100644 --- a/monarch_rdma/src/macros.rs +++ b/monarch_rdma/src/macros.rs @@ -9,9 +9,9 @@ #[macro_export] macro_rules! cu_check { ($result:expr) => { - if $result != cuda_sys::CUresult::CUDA_SUCCESS { + if $result != rdmaxcel_sys::CUDA_SUCCESS { let mut error_string: *const std::os::raw::c_char = std::ptr::null(); - cuda_sys::cuGetErrorString($result, &mut error_string); + rdmaxcel_sys::rdmaxcel_cuGetErrorString($result, &mut error_string); panic!( "cuda failure {}:{} {:?} '{}'", file!(), diff --git a/monarch_rdma/src/rdma_manager_actor.rs b/monarch_rdma/src/rdma_manager_actor.rs index 2af05fa79..2083d3fa3 100644 --- a/monarch_rdma/src/rdma_manager_actor.rs +++ b/monarch_rdma/src/rdma_manager_actor.rs @@ -366,13 +366,13 @@ impl RdmaManagerActor { ) -> Result<(RdmaMemoryRegionView, String), anyhow::Error> { unsafe { let mut mem_type: i32 = 0; - let ptr = addr as cuda_sys::CUdeviceptr; - let err = cuda_sys::cuPointerGetAttribute( + let ptr = addr as rdmaxcel_sys::CUdeviceptr; + let err = rdmaxcel_sys::rdmaxcel_cuPointerGetAttribute( &mut mem_type as *mut _ as *mut std::ffi::c_void, - cuda_sys::CUpointer_attribute_enum::CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + rdmaxcel_sys::CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr, ); 
- let is_cuda = err == cuda_sys::CUresult::CUDA_SUCCESS; + let is_cuda = err == rdmaxcel_sys::CUDA_SUCCESS; let mut selected_rdma_device = None; @@ -457,11 +457,11 @@ impl RdmaManagerActor { mrv = maybe_mrv.unwrap(); } else if is_cuda { let mut fd: i32 = -1; - cuda_sys::cuMemGetHandleForAddressRange( - &mut fd as *mut i32 as *mut std::ffi::c_void, - addr as cuda_sys::CUdeviceptr, + rdmaxcel_sys::rdmaxcel_cuMemGetHandleForAddressRange( + &mut fd, + addr as rdmaxcel_sys::CUdeviceptr, size, - cuda_sys::CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, + rdmaxcel_sys::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0, ); mr = rdmaxcel_sys::ibv_reg_dmabuf_mr(domain_pd, 0, size, 0, fd, access.0 as i32); diff --git a/monarch_rdma/src/test_utils.rs b/monarch_rdma/src/test_utils.rs index 113ad9a60..b1407761e 100644 --- a/monarch_rdma/src/test_utils.rs +++ b/monarch_rdma/src/test_utils.rs @@ -46,25 +46,25 @@ pub fn is_cuda_available() -> bool { fn check_cuda_available() -> bool { unsafe { // Try to initialize CUDA - let result = cuda_sys::cuInit(0); + let result = rdmaxcel_sys::rdmaxcel_cuInit(0); - if result != cuda_sys::CUresult::CUDA_SUCCESS { + if result != rdmaxcel_sys::CUDA_SUCCESS { return false; } // Check if there are any CUDA devices let mut device_count: i32 = 0; - let count_result = cuda_sys::cuDeviceGetCount(&mut device_count); + let count_result = rdmaxcel_sys::rdmaxcel_cuDeviceGetCount(&mut device_count); - if count_result != cuda_sys::CUresult::CUDA_SUCCESS || device_count <= 0 { + if count_result != rdmaxcel_sys::CUDA_SUCCESS || device_count <= 0 { return false; } // Try to get the first device to verify it's actually accessible - let mut device: cuda_sys::CUdevice = std::mem::zeroed(); - let device_result = cuda_sys::cuDeviceGet(&mut device, 0); + let mut device: rdmaxcel_sys::CUdevice = std::mem::zeroed(); + let device_result = rdmaxcel_sys::rdmaxcel_cuDeviceGet(&mut device, 0); - if device_result != cuda_sys::CUresult::CUDA_SUCCESS { + if device_result != 
rdmaxcel_sys::CUDA_SUCCESS { return false; } @@ -270,8 +270,8 @@ pub mod test_utils { pub actor_2: ActorRef, pub rdma_handle_1: RdmaBuffer, pub rdma_handle_2: RdmaBuffer, - cuda_context_1: Option, - cuda_context_2: Option, + cuda_context_1: Option, + cuda_context_2: Option, } #[derive(Debug, Clone)] @@ -375,46 +375,46 @@ pub mod test_utils { } // CUDA case unsafe { - cu_check!(cuda_sys::cuInit(0)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuInit(0)); - let mut dptr: cuda_sys::CUdeviceptr = std::mem::zeroed(); - let mut handle: cuda_sys::CUmemGenericAllocationHandle = std::mem::zeroed(); + let mut dptr: rdmaxcel_sys::CUdeviceptr = std::mem::zeroed(); + let mut handle: rdmaxcel_sys::CUmemGenericAllocationHandle = std::mem::zeroed(); - let mut device: cuda_sys::CUdevice = std::mem::zeroed(); - cu_check!(cuda_sys::cuDeviceGet(&mut device, accel.1 as i32)); + let mut device: rdmaxcel_sys::CUdevice = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet(&mut device, accel.1 as i32)); - let mut context: cuda_sys::CUcontext = std::mem::zeroed(); - cu_check!(cuda_sys::cuCtxCreate_v2(&mut context, 0, accel.1 as i32)); - cu_check!(cuda_sys::cuCtxSetCurrent(context)); + let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2(&mut context, 0, accel.1 as i32)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); let mut granularity: usize = 0; - let mut prop: cuda_sys::CUmemAllocationProp = std::mem::zeroed(); - prop.type_ = cuda_sys::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type_ = cuda_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE; + let mut prop: rdmaxcel_sys::CUmemAllocationProp = std::mem::zeroed(); + prop.type_ = rdmaxcel_sys::CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = device; prop.allocFlags.gpuDirectRDMACapable = 1; prop.requestedHandleTypes = - 
cuda_sys::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + rdmaxcel_sys::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - cu_check!(cuda_sys::cuMemGetAllocationGranularity( + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemGetAllocationGranularity( &mut granularity as *mut usize, &prop, - cuda_sys::CUmemAllocationGranularity_flags::CU_MEM_ALLOC_GRANULARITY_MINIMUM, + rdmaxcel_sys::CU_MEM_ALLOC_GRANULARITY_MINIMUM, )); // ensure our size is aligned let /*mut*/ padded_size: usize = ((buffer_size - 1) / granularity + 1) * granularity; assert!(padded_size == buffer_size); - cu_check!(cuda_sys::cuMemCreate( - &mut handle as *mut cuda_sys::CUmemGenericAllocationHandle, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemCreate( + &mut handle as *mut rdmaxcel_sys::CUmemGenericAllocationHandle, padded_size, &prop, 0 )); // reserve and map the memory - cu_check!(cuda_sys::cuMemAddressReserve( - &mut dptr as *mut cuda_sys::CUdeviceptr, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemAddressReserve( + &mut dptr as *mut rdmaxcel_sys::CUdeviceptr, padded_size, 0, 0, @@ -424,25 +424,25 @@ pub mod test_utils { assert!(padded_size.is_multiple_of(granularity)); // fails if a add cu_check macro; but passes if we don't - let err = cuda_sys::cuMemMap( - dptr as cuda_sys::CUdeviceptr, + let err = rdmaxcel_sys::rdmaxcel_cuMemMap( + dptr as rdmaxcel_sys::CUdeviceptr, padded_size, 0, - handle as cuda_sys::CUmemGenericAllocationHandle, + handle as rdmaxcel_sys::CUmemGenericAllocationHandle, 0, ); - if err != cuda_sys::CUresult::CUDA_SUCCESS { + if err != rdmaxcel_sys::CUDA_SUCCESS { panic!("failed reserving and mapping memory {:?}", err); } // set access - let mut access_desc: cuda_sys::CUmemAccessDesc = std::mem::zeroed(); + let mut access_desc: rdmaxcel_sys::CUmemAccessDesc = std::mem::zeroed(); access_desc.location.type_ = - cuda_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE; + rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; access_desc.location.id = device; access_desc.flags = - 
cuda_sys::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - cu_check!(cuda_sys::cuMemSetAccess(dptr, padded_size, &access_desc, 1)); + rdmaxcel_sys::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess(dptr, padded_size, &access_desc, 1)); buf_vec.push(Buffer { ptr: dptr, len: padded_size, @@ -460,11 +460,11 @@ pub mod test_utils { } unsafe { // Use the CUDA context that was created for the first buffer - cu_check!(cuda_sys::cuCtxSetCurrent( + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent( cuda_contexts[0].expect("No CUDA context found") )); - cu_check!(cuda_sys::cuMemcpyHtoD_v2( + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemcpyHtoD_v2( buf_vec[0].ptr, temp_buffer.as_ptr() as *const std::ffi::c_void, temp_buffer.len() @@ -514,30 +514,30 @@ pub mod test_utils { .await?; if self.cuda_context_1.is_some() { unsafe { - cu_check!(cuda_sys::cuCtxSetCurrent( + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent( self.cuda_context_1.expect("No CUDA context found") )); - cu_check!(cuda_sys::cuMemUnmap( - self.buffer_1.ptr as cuda_sys::CUdeviceptr, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemUnmap( + self.buffer_1.ptr as rdmaxcel_sys::CUdeviceptr, self.buffer_1.len )); - cu_check!(cuda_sys::cuMemAddressFree( - self.buffer_1.ptr as cuda_sys::CUdeviceptr, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemAddressFree( + self.buffer_1.ptr as rdmaxcel_sys::CUdeviceptr, self.buffer_1.len )); } } if self.cuda_context_2.is_some() { unsafe { - cu_check!(cuda_sys::cuCtxSetCurrent( + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent( self.cuda_context_2.expect("No CUDA context found") )); - cu_check!(cuda_sys::cuMemUnmap( - self.buffer_2.ptr as cuda_sys::CUdeviceptr, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemUnmap( + self.buffer_2.ptr as rdmaxcel_sys::CUdeviceptr, self.buffer_2.len )); - cu_check!(cuda_sys::cuMemAddressFree( - self.buffer_2.ptr as cuda_sys::CUdeviceptr, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemAddressFree( + self.buffer_2.ptr as 
rdmaxcel_sys::CUdeviceptr, self.buffer_2.len )); } @@ -579,12 +579,12 @@ pub mod test_utils { let mut temp_buffer = vec![0u8; size].into_boxed_slice(); // SAFETY: The buffer is allocated with the correct size and the pointer is valid. unsafe { - cu_check!(cuda_sys::cuCtxSetCurrent( + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent( cuda_context.expect("No CUDA context found") )); - cu_check!(cuda_sys::cuMemcpyDtoH_v2( + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemcpyDtoH_v2( temp_buffer.as_mut_ptr() as *mut std::ffi::c_void, - virtual_addr as cuda_sys::CUdeviceptr, + virtual_addr as rdmaxcel_sys::CUdeviceptr, size )); } diff --git a/rdmaxcel-sys/build.rs b/rdmaxcel-sys/build.rs index 8e2361431..9a428b69d 100644 --- a/rdmaxcel-sys/build.rs +++ b/rdmaxcel-sys/build.rs @@ -21,10 +21,15 @@ fn main() { // Link against the mlx5 library println!("cargo:rustc-link-lib=mlx5"); + // Link against dl for dynamic loading + println!("cargo:rustc-link-lib=dl"); + // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=src/rdmaxcel.h"); println!("cargo:rerun-if-changed=src/rdmaxcel.c"); println!("cargo:rerun-if-changed=src/rdmaxcel.cpp"); + println!("cargo:rerun-if-changed=src/driver_api.h"); + println!("cargo:rerun-if-changed=src/driver_api.cpp"); // Validate CUDA installation and get CUDA home path let cuda_home = match build_utils::validate_cuda_installation() { @@ -88,6 +93,10 @@ fn main() { .allowlist_function("pt_cuda_allocator_compatibility") .allowlist_function("register_segments") .allowlist_function("deregister_segments") + .allowlist_function("rdmaxcel_cu.*") + .allowlist_function("get_cuda_pci_address_from_ptr") + .allowlist_function("rdmaxcel_print_device_info") + .allowlist_function("rdmaxcel_error_string") .allowlist_type("ibv_.*") .allowlist_type("mlx5dv_.*") .allowlist_type("mlx5_wqe_.*") @@ -149,7 +158,8 @@ fn main() { } }; println!("cargo:rustc-link-search=native={}", cuda_lib_dir); - 
println!("cargo:rustc-link-lib=cuda"); + // Note: libcuda is now loaded dynamically via dlopen in driver_api.cpp + // Only link cudart (CUDA Runtime API) println!("cargo:rustc-link-lib=cudart"); // Link PyTorch C++ libraries for c10 symbols @@ -213,7 +223,8 @@ fn main() { // Compile the C++ source file for CUDA allocator compatibility let cpp_source_path = format!("{}/src/rdmaxcel.cpp", manifest_dir); - if Path::new(&cpp_source_path).exists() { + let driver_api_cpp_path = format!("{}/src/driver_api.cpp", manifest_dir); + if Path::new(&cpp_source_path).exists() && Path::new(&driver_api_cpp_path).exists() { let mut libtorch_include_dirs: Vec = vec![]; // Use the same approach as torch-sys: Python discovery first, env vars as fallback @@ -249,6 +260,7 @@ fn main() { let mut cpp_build = cc::Build::new(); cpp_build .file(&cpp_source_path) + .file(&driver_api_cpp_path) .include(format!("{}/src", manifest_dir)) .flag("-fPIC") .cpp(true) @@ -270,7 +282,12 @@ fn main() { cpp_build.compile("rdmaxcel_cpp"); } else { - panic!("C++ source file not found at {}", cpp_source_path); + if !Path::new(&cpp_source_path).exists() { + panic!("C++ source file not found at {}", cpp_source_path); + } + if !Path::new(&driver_api_cpp_path).exists() { + panic!("Driver API C++ source file not found at {}", driver_api_cpp_path); + } } // Compile the CUDA source file let cuda_source_path = format!("{}/src/rdmaxcel.cu", manifest_dir); diff --git a/rdmaxcel-sys/src/driver_api.cpp b/rdmaxcel-sys/src/driver_api.cpp new file mode 100644 index 000000000..3c6033e7c --- /dev/null +++ b/rdmaxcel-sys/src/driver_api.cpp @@ -0,0 +1,204 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "driver_api.h" +#include +#include +#include + +// List of CUDA driver functions needed by rdmaxcel +#define RDMAXCEL_CUDA_DRIVER_API(_) \ + _(cuMemGetHandleForAddressRange) \ + _(cuMemGetAllocationGranularity) \ + _(cuMemCreate) \ + _(cuMemAddressReserve) \ + _(cuMemMap) \ + _(cuMemSetAccess) \ + _(cuMemUnmap) \ + _(cuMemAddressFree) \ + _(cuMemRelease) \ + _(cuMemcpyHtoD_v2) \ + _(cuMemcpyDtoH_v2) \ + _(cuPointerGetAttribute) \ + _(cuInit) \ + _(cuDeviceGet) \ + _(cuDeviceGetCount) \ + _(cuDeviceGetAttribute) \ + _(cuCtxCreate_v2) \ + _(cuCtxSetCurrent) \ + _(cuGetErrorString) + +namespace rdmaxcel { + +struct DriverAPI { +#define CREATE_MEMBER(name) decltype(&name) name##_; + RDMAXCEL_CUDA_DRIVER_API(CREATE_MEMBER) +#undef CREATE_MEMBER + static DriverAPI* get(); +}; + +namespace { + +DriverAPI create_driver_api() { + // Try to open libcuda.so.1 - RTLD_NOLOAD means only succeed if already loaded + void* handle = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD); + if (!handle) { + std::cerr << "[RdmaXcel] Warning: libcuda.so.1 not loaded, trying to load it now" << std::endl; + handle = dlopen("libcuda.so.1", RTLD_LAZY); + } + + if (!handle) { + throw std::runtime_error(std::string("[RdmaXcel] Can't open libcuda.so.1: ") + dlerror()); + } + + DriverAPI r{}; + +#define LOOKUP_CUDA_ENTRY(name) \ + r.name##_ = reinterpret_cast(dlsym(handle, #name)); \ + if (!r.name##_) { \ + throw std::runtime_error(std::string("[RdmaXcel] Can't find ") + #name + ": " + dlerror()); \ + } + + RDMAXCEL_CUDA_DRIVER_API(LOOKUP_CUDA_ENTRY) +#undef LOOKUP_CUDA_ENTRY + + return r; +} + +} // namespace + +DriverAPI* DriverAPI::get() { + static DriverAPI singleton = create_driver_api(); + return &singleton; +} + +} // namespace rdmaxcel + +// C API wrapper implementations +extern "C" { + +// Memory management +CUresult rdmaxcel_cuMemGetHandleForAddressRange( + int* handle, + CUdeviceptr dptr, + size_t size, + CUmemRangeHandleType handleType, + unsigned long long flags) { + 
return rdmaxcel::DriverAPI::get()->cuMemGetHandleForAddressRange_( + handle, dptr, size, handleType, flags); +} + +CUresult rdmaxcel_cuMemGetAllocationGranularity( + size_t* granularity, + const CUmemAllocationProp* prop, + CUmemAllocationGranularity_flags option) { + return rdmaxcel::DriverAPI::get()->cuMemGetAllocationGranularity_( + granularity, prop, option); +} + +CUresult rdmaxcel_cuMemCreate( + CUmemGenericAllocationHandle* handle, + size_t size, + const CUmemAllocationProp* prop, + unsigned long long flags) { + return rdmaxcel::DriverAPI::get()->cuMemCreate_(handle, size, prop, flags); +} + +CUresult rdmaxcel_cuMemAddressReserve( + CUdeviceptr* ptr, + size_t size, + size_t alignment, + CUdeviceptr addr, + unsigned long long flags) { + return rdmaxcel::DriverAPI::get()->cuMemAddressReserve_( + ptr, size, alignment, addr, flags); +} + +CUresult rdmaxcel_cuMemMap( + CUdeviceptr ptr, + size_t size, + size_t offset, + CUmemGenericAllocationHandle handle, + unsigned long long flags) { + return rdmaxcel::DriverAPI::get()->cuMemMap_(ptr, size, offset, handle, flags); +} + +CUresult rdmaxcel_cuMemSetAccess( + CUdeviceptr ptr, + size_t size, + const CUmemAccessDesc* desc, + size_t count) { + return rdmaxcel::DriverAPI::get()->cuMemSetAccess_(ptr, size, desc, count); +} + +CUresult rdmaxcel_cuMemUnmap(CUdeviceptr ptr, size_t size) { + return rdmaxcel::DriverAPI::get()->cuMemUnmap_(ptr, size); +} + +CUresult rdmaxcel_cuMemAddressFree(CUdeviceptr ptr, size_t size) { + return rdmaxcel::DriverAPI::get()->cuMemAddressFree_(ptr, size); +} + +CUresult rdmaxcel_cuMemRelease(CUmemGenericAllocationHandle handle) { + return rdmaxcel::DriverAPI::get()->cuMemRelease_(handle); +} + +CUresult rdmaxcel_cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount) { + return rdmaxcel::DriverAPI::get()->cuMemcpyHtoD_v2_(dstDevice, srcHost, ByteCount); +} + +CUresult rdmaxcel_cuMemcpyDtoH_v2(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) { + return 
rdmaxcel::DriverAPI::get()->cuMemcpyDtoH_v2_(dstHost, srcDevice, ByteCount); +} + +// Pointer queries +CUresult rdmaxcel_cuPointerGetAttribute( + void* data, + CUpointer_attribute attribute, + CUdeviceptr ptr) { + return rdmaxcel::DriverAPI::get()->cuPointerGetAttribute_( + data, attribute, ptr); +} + +// Device management +CUresult rdmaxcel_cuInit(unsigned int Flags) { + return rdmaxcel::DriverAPI::get()->cuInit_(Flags); +} + +CUresult rdmaxcel_cuDeviceGet(CUdevice* device, int ordinal) { + return rdmaxcel::DriverAPI::get()->cuDeviceGet_(device, ordinal); +} + +CUresult rdmaxcel_cuDeviceGetCount(int* count) { + return rdmaxcel::DriverAPI::get()->cuDeviceGetCount_(count); +} + +CUresult rdmaxcel_cuDeviceGetAttribute( + int* pi, + CUdevice_attribute attrib, + CUdevice dev) { + return rdmaxcel::DriverAPI::get()->cuDeviceGetAttribute_(pi, attrib, dev); +} + +// Context management +CUresult rdmaxcel_cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev) { + return rdmaxcel::DriverAPI::get()->cuCtxCreate_v2_(pctx, flags, dev); +} + +CUresult rdmaxcel_cuCtxSetCurrent(CUcontext ctx) { + return rdmaxcel::DriverAPI::get()->cuCtxSetCurrent_(ctx); +} + +// Error handling +CUresult rdmaxcel_cuGetErrorString(CUresult error, const char** pStr) { + return rdmaxcel::DriverAPI::get()->cuGetErrorString_(error, pStr); +} + +} // extern "C" + + diff --git a/rdmaxcel-sys/src/driver_api.h b/rdmaxcel-sys/src/driver_api.h new file mode 100644 index 000000000..49eeece20 --- /dev/null +++ b/rdmaxcel-sys/src/driver_api.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// C API wrapper functions for CUDA driver functions +// These are loaded dynamically via dlopen and exported to Rust via bindgen + +// Memory management +CUresult rdmaxcel_cuMemGetHandleForAddressRange( + int* handle, + CUdeviceptr dptr, + size_t size, + CUmemRangeHandleType handleType, + unsigned long long flags); + +CUresult rdmaxcel_cuMemGetAllocationGranularity( + size_t* granularity, + const CUmemAllocationProp* prop, + CUmemAllocationGranularity_flags option); + +CUresult rdmaxcel_cuMemCreate( + CUmemGenericAllocationHandle* handle, + size_t size, + const CUmemAllocationProp* prop, + unsigned long long flags); + +CUresult rdmaxcel_cuMemAddressReserve( + CUdeviceptr* ptr, + size_t size, + size_t alignment, + CUdeviceptr addr, + unsigned long long flags); + +CUresult rdmaxcel_cuMemMap( + CUdeviceptr ptr, + size_t size, + size_t offset, + CUmemGenericAllocationHandle handle, + unsigned long long flags); + +CUresult rdmaxcel_cuMemSetAccess( + CUdeviceptr ptr, + size_t size, + const CUmemAccessDesc* desc, + size_t count); + +CUresult rdmaxcel_cuMemUnmap(CUdeviceptr ptr, size_t size); + +CUresult rdmaxcel_cuMemAddressFree(CUdeviceptr ptr, size_t size); + +CUresult rdmaxcel_cuMemRelease(CUmemGenericAllocationHandle handle); + +CUresult rdmaxcel_cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); + +CUresult rdmaxcel_cuMemcpyDtoH_v2(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); + +// Pointer queries +CUresult rdmaxcel_cuPointerGetAttribute( + void* data, + CUpointer_attribute attribute, + CUdeviceptr ptr); + +// Device management +CUresult rdmaxcel_cuInit(unsigned int Flags); + +CUresult rdmaxcel_cuDeviceGet(CUdevice* device, int ordinal); + +CUresult rdmaxcel_cuDeviceGetCount(int* count); + +CUresult +rdmaxcel_cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev); + +// Context management +CUresult rdmaxcel_cuCtxCreate_v2(CUcontext* 
pctx, unsigned int flags, CUdevice dev); + +CUresult rdmaxcel_cuCtxSetCurrent(CUcontext ctx); + +// Error handling +CUresult rdmaxcel_cuGetErrorString(CUresult error, const char** pStr); + +#ifdef __cplusplus +} // extern "C" +#endif + diff --git a/rdmaxcel-sys/src/rdmaxcel.cpp b/rdmaxcel-sys/src/rdmaxcel.cpp index 45c40d665..0ce750716 100644 --- a/rdmaxcel-sys/src/rdmaxcel.cpp +++ b/rdmaxcel-sys/src/rdmaxcel.cpp @@ -7,6 +7,7 @@ */ #include "rdmaxcel.h" +#include "driver_api.h" #include #include #include @@ -235,7 +236,7 @@ int compact_mrs(struct ibv_pd* pd, SegmentInfo& seg, int access_flags) { // Get dmabuf handle for the entire segment int fd = -1; - CUresult cu_result = cuMemGetHandleForAddressRange( + CUresult cu_result = rdmaxcel_cuMemGetHandleForAddressRange( &fd, static_cast(start_addr), total_size, @@ -297,7 +298,7 @@ int register_segments(struct ibv_pd* pd, struct ibv_qp* qp) { } int fd = -1; - CUresult cu_result = cuMemGetHandleForAddressRange( + CUresult cu_result = rdmaxcel_cuMemGetHandleForAddressRange( &fd, static_cast(chunk_start), chunk_size, @@ -352,7 +353,7 @@ int get_cuda_pci_address_from_ptr( } int device_ordinal = -1; - CUresult err = cuPointerGetAttribute( + CUresult err = rdmaxcel_cuPointerGetAttribute( &device_ordinal, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, cuda_ptr); if (err != CUDA_SUCCESS) { @@ -360,7 +361,7 @@ int get_cuda_pci_address_from_ptr( } CUdevice device; - err = cuDeviceGet(&device, device_ordinal); + err = rdmaxcel_cuDeviceGet(&device, device_ordinal); if (err != CUDA_SUCCESS) { return RDMAXCEL_CUDA_GET_DEVICE_FAILED; } @@ -371,20 +372,20 @@ int get_cuda_pci_address_from_ptr( // Get PCI bus ID err = - cuDeviceGetAttribute(&pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device); + rdmaxcel_cuDeviceGetAttribute(&pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device); if (err != CUDA_SUCCESS) { return RDMAXCEL_CUDA_GET_ATTRIBUTE_FAILED; } // Get PCI device ID - err = cuDeviceGetAttribute( + err = rdmaxcel_cuDeviceGetAttribute( 
&pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, device); if (err != CUDA_SUCCESS) { return RDMAXCEL_CUDA_GET_ATTRIBUTE_FAILED; } // Get PCI domain ID - err = cuDeviceGetAttribute( + err = rdmaxcel_cuDeviceGetAttribute( &pci_domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, device); if (err != CUDA_SUCCESS) { return RDMAXCEL_CUDA_GET_ATTRIBUTE_FAILED; diff --git a/rdmaxcel-sys/src/rdmaxcel.h b/rdmaxcel-sys/src/rdmaxcel.h index b0ade31db..01b01d575 100644 --- a/rdmaxcel-sys/src/rdmaxcel.h +++ b/rdmaxcel-sys/src/rdmaxcel.h @@ -13,6 +13,7 @@ #include #include #include +#include "driver_api.h" #ifdef __cplusplus extern "C" { From 412b1ec359258f6791d1b9d3fda60e291a8530c8 Mon Sep 17 00:00:00 2001 From: zdevito Date: Tue, 18 Nov 2025 15:58:27 -0800 Subject: [PATCH 2/5] Update on "Remove direct dependency on libcuda" Dynamically load any cuda driver functions so that monarch library can be loaded a machine that doesn't have a gpu even if the library is built with one. Differential Revision: [D87380631](https://our.internmc.facebook.com/intern/diff/D87380631/) [ghstack-poisoned] --- cuda-sys/src/lib.rs | 1 - monarch_rdma/src/test_utils.rs | 24 +++++++--- rdmaxcel-sys/build.rs | 5 ++- rdmaxcel-sys/src/driver_api.cpp | 80 +++++++++++++++++++-------------- rdmaxcel-sys/src/driver_api.h | 14 ++++-- rdmaxcel-sys/src/rdmaxcel.cpp | 6 +-- 6 files changed, 80 insertions(+), 50 deletions(-) diff --git a/cuda-sys/src/lib.rs b/cuda-sys/src/lib.rs index 64ed70c64..a99325278 100644 --- a/cuda-sys/src/lib.rs +++ b/cuda-sys/src/lib.rs @@ -34,4 +34,3 @@ mod inner { } pub use inner::*; - diff --git a/monarch_rdma/src/test_utils.rs b/monarch_rdma/src/test_utils.rs index b1407761e..bd187691d 100644 --- a/monarch_rdma/src/test_utils.rs +++ b/monarch_rdma/src/test_utils.rs @@ -381,10 +381,17 @@ pub mod test_utils { let mut handle: rdmaxcel_sys::CUmemGenericAllocationHandle = std::mem::zeroed(); let mut device: rdmaxcel_sys::CUdevice = std::mem::zeroed(); - 
cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet(&mut device, accel.1 as i32)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet( + &mut device, + accel.1 as i32 + )); let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); - cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2(&mut context, 0, accel.1 as i32)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2( + &mut context, + 0, + accel.1 as i32 + )); cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); let mut granularity: usize = 0; @@ -437,12 +444,15 @@ pub mod test_utils { // set access let mut access_desc: rdmaxcel_sys::CUmemAccessDesc = std::mem::zeroed(); - access_desc.location.type_ = - rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; + access_desc.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; access_desc.location.id = device; - access_desc.flags = - rdmaxcel_sys::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess(dptr, padded_size, &access_desc, 1)); + access_desc.flags = rdmaxcel_sys::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess( + dptr, + padded_size, + &access_desc, + 1 + )); buf_vec.push(Buffer { ptr: dptr, len: padded_size, diff --git a/rdmaxcel-sys/build.rs b/rdmaxcel-sys/build.rs index 9a428b69d..036122a66 100644 --- a/rdmaxcel-sys/build.rs +++ b/rdmaxcel-sys/build.rs @@ -286,7 +286,10 @@ fn main() { panic!("C++ source file not found at {}", cpp_source_path); } if !Path::new(&driver_api_cpp_path).exists() { - panic!("Driver API C++ source file not found at {}", driver_api_cpp_path); + panic!( + "Driver API C++ source file not found at {}", + driver_api_cpp_path + ); } } // Compile the CUDA source file diff --git a/rdmaxcel-sys/src/driver_api.cpp b/rdmaxcel-sys/src/driver_api.cpp index 3c6033e7c..cde8efc44 100644 --- a/rdmaxcel-sys/src/driver_api.cpp +++ b/rdmaxcel-sys/src/driver_api.cpp @@ -8,29 +8,29 @@ #include "driver_api.h" #include -#include #include +#include // List of CUDA driver functions needed by 
rdmaxcel -#define RDMAXCEL_CUDA_DRIVER_API(_) \ - _(cuMemGetHandleForAddressRange) \ - _(cuMemGetAllocationGranularity) \ - _(cuMemCreate) \ - _(cuMemAddressReserve) \ - _(cuMemMap) \ - _(cuMemSetAccess) \ - _(cuMemUnmap) \ - _(cuMemAddressFree) \ - _(cuMemRelease) \ - _(cuMemcpyHtoD_v2) \ - _(cuMemcpyDtoH_v2) \ - _(cuPointerGetAttribute) \ - _(cuInit) \ - _(cuDeviceGet) \ - _(cuDeviceGetCount) \ - _(cuDeviceGetAttribute) \ - _(cuCtxCreate_v2) \ - _(cuCtxSetCurrent) \ +#define RDMAXCEL_CUDA_DRIVER_API(_) \ + _(cuMemGetHandleForAddressRange) \ + _(cuMemGetAllocationGranularity) \ + _(cuMemCreate) \ + _(cuMemAddressReserve) \ + _(cuMemMap) \ + _(cuMemSetAccess) \ + _(cuMemUnmap) \ + _(cuMemAddressFree) \ + _(cuMemRelease) \ + _(cuMemcpyHtoD_v2) \ + _(cuMemcpyDtoH_v2) \ + _(cuPointerGetAttribute) \ + _(cuInit) \ + _(cuDeviceGet) \ + _(cuDeviceGetCount) \ + _(cuDeviceGetAttribute) \ + _(cuCtxCreate_v2) \ + _(cuCtxSetCurrent) \ _(cuGetErrorString) namespace rdmaxcel { @@ -48,20 +48,24 @@ DriverAPI create_driver_api() { // Try to open libcuda.so.1 - RTLD_NOLOAD means only succeed if already loaded void* handle = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_NOLOAD); if (!handle) { - std::cerr << "[RdmaXcel] Warning: libcuda.so.1 not loaded, trying to load it now" << std::endl; + std::cerr + << "[RdmaXcel] Warning: libcuda.so.1 not loaded, trying to load it now" + << std::endl; handle = dlopen("libcuda.so.1", RTLD_LAZY); } if (!handle) { - throw std::runtime_error(std::string("[RdmaXcel] Can't open libcuda.so.1: ") + dlerror()); + throw std::runtime_error( + std::string("[RdmaXcel] Can't open libcuda.so.1: ") + dlerror()); } DriverAPI r{}; -#define LOOKUP_CUDA_ENTRY(name) \ - r.name##_ = reinterpret_cast(dlsym(handle, #name)); \ - if (!r.name##_) { \ - throw std::runtime_error(std::string("[RdmaXcel] Can't find ") + #name + ": " + dlerror()); \ +#define LOOKUP_CUDA_ENTRY(name) \ + r.name##_ = reinterpret_cast(dlsym(handle, #name)); \ + if (!r.name##_) { \ + throw 
std::runtime_error( \ + std::string("[RdmaXcel] Can't find ") + #name + ": " + dlerror()); \ } RDMAXCEL_CUDA_DRIVER_API(LOOKUP_CUDA_ENTRY) @@ -125,7 +129,8 @@ CUresult rdmaxcel_cuMemMap( size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags) { - return rdmaxcel::DriverAPI::get()->cuMemMap_(ptr, size, offset, handle, flags); + return rdmaxcel::DriverAPI::get()->cuMemMap_( + ptr, size, offset, handle, flags); } CUresult rdmaxcel_cuMemSetAccess( @@ -148,12 +153,20 @@ CUresult rdmaxcel_cuMemRelease(CUmemGenericAllocationHandle handle) { return rdmaxcel::DriverAPI::get()->cuMemRelease_(handle); } -CUresult rdmaxcel_cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount) { - return rdmaxcel::DriverAPI::get()->cuMemcpyHtoD_v2_(dstDevice, srcHost, ByteCount); +CUresult rdmaxcel_cuMemcpyHtoD_v2( + CUdeviceptr dstDevice, + const void* srcHost, + size_t ByteCount) { + return rdmaxcel::DriverAPI::get()->cuMemcpyHtoD_v2_( + dstDevice, srcHost, ByteCount); } -CUresult rdmaxcel_cuMemcpyDtoH_v2(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) { - return rdmaxcel::DriverAPI::get()->cuMemcpyDtoH_v2_(dstHost, srcDevice, ByteCount); +CUresult rdmaxcel_cuMemcpyDtoH_v2( + void* dstHost, + CUdeviceptr srcDevice, + size_t ByteCount) { + return rdmaxcel::DriverAPI::get()->cuMemcpyDtoH_v2_( + dstHost, srcDevice, ByteCount); } // Pointer queries @@ -186,7 +199,8 @@ CUresult rdmaxcel_cuDeviceGetAttribute( } // Context management -CUresult rdmaxcel_cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev) { +CUresult +rdmaxcel_cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev) { return rdmaxcel::DriverAPI::get()->cuCtxCreate_v2_(pctx, flags, dev); } @@ -200,5 +214,3 @@ CUresult rdmaxcel_cuGetErrorString(CUresult error, const char** pStr) { } } // extern "C" - - diff --git a/rdmaxcel-sys/src/driver_api.h b/rdmaxcel-sys/src/driver_api.h index 49eeece20..2e570df57 100644 --- a/rdmaxcel-sys/src/driver_api.h +++ 
b/rdmaxcel-sys/src/driver_api.h @@ -61,9 +61,15 @@ CUresult rdmaxcel_cuMemAddressFree(CUdeviceptr ptr, size_t size); CUresult rdmaxcel_cuMemRelease(CUmemGenericAllocationHandle handle); -CUresult rdmaxcel_cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); +CUresult rdmaxcel_cuMemcpyHtoD_v2( + CUdeviceptr dstDevice, + const void* srcHost, + size_t ByteCount); -CUresult rdmaxcel_cuMemcpyDtoH_v2(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); +CUresult rdmaxcel_cuMemcpyDtoH_v2( + void* dstHost, + CUdeviceptr srcDevice, + size_t ByteCount); // Pointer queries CUresult rdmaxcel_cuPointerGetAttribute( @@ -82,7 +88,8 @@ CUresult rdmaxcel_cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev); // Context management -CUresult rdmaxcel_cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev); +CUresult +rdmaxcel_cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev); CUresult rdmaxcel_cuCtxSetCurrent(CUcontext ctx); @@ -92,4 +99,3 @@ CUresult rdmaxcel_cuGetErrorString(CUresult error, const char** pStr); #ifdef __cplusplus } // extern "C" #endif - diff --git a/rdmaxcel-sys/src/rdmaxcel.cpp b/rdmaxcel-sys/src/rdmaxcel.cpp index 0ce750716..9e359895b 100644 --- a/rdmaxcel-sys/src/rdmaxcel.cpp +++ b/rdmaxcel-sys/src/rdmaxcel.cpp @@ -7,7 +7,6 @@ */ #include "rdmaxcel.h" -#include "driver_api.h" #include #include #include @@ -15,6 +14,7 @@ #include #include #include +#include "driver_api.h" // MR size must be a multiple of 2MB const size_t MR_ALIGNMENT = 2ULL * 1024 * 1024; @@ -371,8 +371,8 @@ int get_cuda_pci_address_from_ptr( int pci_domain_id = -1; // Get PCI bus ID - err = - rdmaxcel_cuDeviceGetAttribute(&pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device); + err = rdmaxcel_cuDeviceGetAttribute( + &pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device); if (err != CUDA_SUCCESS) { return RDMAXCEL_CUDA_GET_ATTRIBUTE_FAILED; } From e0a3137b0728bff4ed363ef4d8f2977a814da696 Mon Sep 17 00:00:00 2001 From: zdevito 
Date: Wed, 19 Nov 2025 09:58:03 -0800 Subject: [PATCH 3/5] Update on "Remove direct dependency on libcuda" Dynamically load any cuda driver functions so that monarch library can be loaded on a machine that doesn't have a gpu even if the library is built with one. Differential Revision: [D87380631](https://our.internmc.facebook.com/intern/diff/D87380631/) [ghstack-poisoned] From 67ddce51575e2ff8dbee2847bb15cfbc99568b24 Mon Sep 17 00:00:00 2001 From: zdevito Date: Wed, 19 Nov 2025 13:33:10 -0800 Subject: [PATCH 4/5] Update on "Remove direct dependency on libcuda" Dynamically load any cuda driver functions so that monarch library can be loaded on a machine that doesn't have a gpu even if the library is built with one. Differential Revision: [D87380631](https://our.internmc.facebook.com/intern/diff/D87380631/) [ghstack-poisoned] --- .../cuda_ping_pong/src/cuda_ping_pong.rs | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs b/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs index e4e283807..28c5a4bb2 100644 --- a/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs +++ b/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs @@ -269,45 +269,45 @@ impl Actor for CudaRdmaActor { // For this example, we'll use a regular Rust allocation as a placeholder // The actual CUDA allocation would be handled by the monarch_rdma library unsafe { - cu_check!(cuda_sys::cuInit(0)); - let mut dptr: cuda_sys::CUdeviceptr = std::mem::zeroed(); - let mut handle: cuda_sys::CUmemGenericAllocationHandle = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuInit(0)); + let mut dptr: rdmaxcel_sys::CUdeviceptr = std::mem::zeroed(); + let mut handle: rdmaxcel_sys::CUmemGenericAllocationHandle = std::mem::zeroed(); - let mut device: cuda_sys::CUdevice = std::mem::zeroed(); - cu_check!(cuda_sys::cuDeviceGet(&mut device, device_id as i32)); + let mut device: rdmaxcel_sys::CUdevice = 
std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet(&mut device, device_id as i32)); - let mut context: cuda_sys::CUcontext = std::mem::zeroed(); - cu_check!(cuda_sys::cuCtxCreate_v2(&mut context, 0, device_id as i32)); - cu_check!(cuda_sys::cuCtxSetCurrent(context)); + let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2(&mut context, 0, device_id as i32)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); let mut granularity: usize = 0; - let mut prop: cuda_sys::CUmemAllocationProp = std::mem::zeroed(); - prop.type_ = cuda_sys::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type_ = cuda_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE; + let mut prop: rdmaxcel_sys::CUmemAllocationProp = std::mem::zeroed(); + prop.type_ = rdmaxcel_sys::CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = device; prop.allocFlags.gpuDirectRDMACapable = 1; prop.requestedHandleTypes = - cuda_sys::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + rdmaxcel_sys::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - cu_check!(cuda_sys::cuMemGetAllocationGranularity( + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemGetAllocationGranularity( &mut granularity as *mut usize, &prop, - cuda_sys::CUmemAllocationGranularity_flags::CU_MEM_ALLOC_GRANULARITY_MINIMUM, + rdmaxcel_sys::CU_MEM_ALLOC_GRANULARITY_MINIMUM, )); // ensure our size is aligned let padded_size: usize = ((buffer_size - 1) / granularity + 1) * granularity; assert!(padded_size == buffer_size); - cu_check!(cuda_sys::cuMemCreate( - &mut handle as *mut cuda_sys::CUmemGenericAllocationHandle, + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemCreate( + &mut handle as *mut rdmaxcel_sys::CUmemGenericAllocationHandle, padded_size, &prop, 0 )); // reserve and map the memory - cu_check!(cuda_sys::cuMemAddressReserve( - &mut dptr as *mut cuda_sys::CUdeviceptr, + 
cu_check!(rdmaxcel_sys::rdmaxcel_cuMemAddressReserve( + &mut dptr as *mut rdmaxcel_sys::CUdeviceptr, padded_size, 0, 0, @@ -317,23 +317,23 @@ impl Actor for CudaRdmaActor { assert!(padded_size % granularity == 0); // fails if a add cu_check macro; but passes if we don't - let err = cuda_sys::cuMemMap( - dptr as cuda_sys::CUdeviceptr, + let err = rdmaxcel_sys::rdmaxcel_cuMemMap( + dptr as rdmaxcel_sys::CUdeviceptr, padded_size, 0, - handle as cuda_sys::CUmemGenericAllocationHandle, + handle as rdmaxcel_sys::CUmemGenericAllocationHandle, 0, ); - if err != cuda_sys::CUresult::CUDA_SUCCESS { + if err != rdmaxcel_sys::CUDA_SUCCESS { panic!("failed reserving and mapping memory {:?}", err); } // set access - let mut access_desc: cuda_sys::CUmemAccessDesc = std::mem::zeroed(); - access_desc.location.type_ = cuda_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE; + let mut access_desc: rdmaxcel_sys::CUmemAccessDesc = std::mem::zeroed(); + access_desc.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; access_desc.location.id = device; - access_desc.flags = cuda_sys::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - cu_check!(cuda_sys::cuMemSetAccess(dptr, padded_size, &access_desc, 1)); + access_desc.flags = rdmaxcel_sys::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess(dptr, padded_size, &access_desc, 1)); Ok(Self { device_id, cpu_buffer, @@ -385,15 +385,15 @@ impl Handler for CudaRdmaActor { self.cpu_buffer.fill(value); unsafe { - let mut context: cuda_sys::CUcontext = std::mem::zeroed(); - cu_check!(cuda_sys::cuCtxCreate_v2( + let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2( &mut context, 0, self.device_id as i32 )); - cu_check!(cuda_sys::cuCtxSetCurrent(context)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); cuda_sys::cudaDeviceSynchronize(); - cu_check!(cuda_sys::cuMemcpyHtoD_v2( + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemcpyHtoD_v2( 
self.cu_ptr as u64, self.cpu_buffer.as_ptr() as *const std::ffi::c_void, self.cpu_buffer.len() @@ -459,13 +459,13 @@ impl Handler for CudaRdmaActor { validate_execution_context().await?; unsafe { - let mut context: cuda_sys::CUcontext = std::mem::zeroed(); - cu_check!(cuda_sys::cuCtxCreate_v2( + let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2( &mut context, 0, self.device_id as i32 )); - cu_check!(cuda_sys::cuCtxSetCurrent(context)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); } let qp = self .rdma_manager @@ -532,17 +532,17 @@ impl Handler for CudaRdmaActor { VerifyBuffer(expected_values, reply): VerifyBuffer, ) -> Result<(), anyhow::Error> { unsafe { - let mut context: cuda_sys::CUcontext = std::mem::zeroed(); - cu_check!(cuda_sys::cuCtxCreate_v2( + let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2( &mut context, 0, self.device_id as i32 )); - cu_check!(cuda_sys::cuCtxSetCurrent(context)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); cuda_sys::cudaDeviceSynchronize(); - cu_check!(cuda_sys::cuMemcpyDtoH_v2( + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemcpyDtoH_v2( self.cpu_buffer.as_mut_ptr() as *mut std::ffi::c_void, - self.cu_ptr as cuda_sys::CUdeviceptr, + self.cu_ptr as rdmaxcel_sys::CUdeviceptr, self.cpu_buffer.len(), )); } From eb95a8452a0415d0553a41055df0a9c9a439d298 Mon Sep 17 00:00:00 2001 From: zdevito Date: Wed, 19 Nov 2025 13:56:29 -0800 Subject: [PATCH 5/5] Update on "Remove direct dependency on libcuda" Dynamically load any cuda driver functions so that monarch library can be loaded on a machine that doesn't have a gpu even if the library is built with one.
Differential Revision: [D87380631](https://our.internmc.facebook.com/intern/diff/D87380631/) [ghstack-poisoned] --- .../cuda_ping_pong/src/cuda_ping_pong.rs | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs b/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs index 28c5a4bb2..27e41d79d 100644 --- a/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs +++ b/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs @@ -274,10 +274,17 @@ impl Actor for CudaRdmaActor { let mut handle: rdmaxcel_sys::CUmemGenericAllocationHandle = std::mem::zeroed(); let mut device: rdmaxcel_sys::CUdevice = std::mem::zeroed(); - cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet(&mut device, device_id as i32)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet( + &mut device, + device_id as i32 + )); let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed(); - cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2(&mut context, 0, device_id as i32)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2( + &mut context, + 0, + device_id as i32 + )); cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context)); let mut granularity: usize = 0; @@ -286,8 +293,7 @@ impl Actor for CudaRdmaActor { prop.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = device; prop.allocFlags.gpuDirectRDMACapable = 1; - prop.requestedHandleTypes = - rdmaxcel_sys::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + prop.requestedHandleTypes = rdmaxcel_sys::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; cu_check!(rdmaxcel_sys::rdmaxcel_cuMemGetAllocationGranularity( &mut granularity as *mut usize, @@ -333,7 +339,12 @@ impl Actor for CudaRdmaActor { access_desc.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE; access_desc.location.id = device; access_desc.flags = rdmaxcel_sys::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess(dptr, padded_size, &access_desc, 
1)); + cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess( + dptr, + padded_size, + &access_desc, + 1 + )); Ok(Self { device_id, cpu_buffer,