meta-pytorch · zdevito · Nov 18, 2025 · Nov 18, 2025 · Nov 19, 2025 · Nov 19, 2025
diff --git a/cuda-sys/build.rs b/cuda-sys/build.rs
@@ -31,11 +31,11 @@ fn main() {
         .clang_arg("c++")
         .clang_arg("-std=gnu++20")
         .parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
-        // Allow the specified functions and types
-        .allowlist_function("cu.*")
-        .allowlist_function("CU.*")
-        .allowlist_type("cu.*")
-        .allowlist_type("CU.*")
+        // Allow the specified functions and types (CUDA Runtime API only)
+        .allowlist_function("cuda.*")
+        .allowlist_function("CUDA.*")
+        .allowlist_type("cuda.*")
+        .allowlist_type("CUDA.*")
         // Use newtype enum style
         .default_enum_style(bindgen::EnumVariation::NewType {
             is_bitfield: false,
@@ -78,7 +78,6 @@ fn main() {
         }
     };
     println!("cargo:rustc-link-search=native={}", cuda_lib_dir);
-    println!("cargo:rustc-link-lib=cuda");
     println!("cargo:rustc-link-lib=cudart");
 
     // Generate bindings - fail fast if this doesn't work

diff --git a/cuda-sys/src/lib.rs b/cuda-sys/src/lib.rs
@@ -34,20 +34,3 @@ mod inner {
 }
 
 pub use inner::*;
-
-#[cfg(test)]
-mod tests {
-    use std::mem::MaybeUninit;
-
-    use super::*;
-
-    #[test]
-    fn sanity() {
-        // SAFETY: testing bindings
-        unsafe {
-            let mut version = MaybeUninit::<i32>::uninit();
-            let result = cuDriverGetVersion(version.as_mut_ptr());
-            assert_eq!(result, cudaError_enum(0));
-        }
-    }
-}
diff --git a/cuda-sys/src/wrapper.h b/cuda-sys/src/wrapper.h
@@ -8,5 +8,4 @@
 
 #pragma once
 
-#include <cuda.h>
 #include <cuda_runtime.h>
diff --git a/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs b/monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs
@@ -269,45 +269,51 @@ impl Actor for CudaRdmaActor {
         // For this example, we'll use a regular Rust allocation as a placeholder
         // The actual CUDA allocation would be handled by the monarch_rdma library
         unsafe {
-            cu_check!(cuda_sys::cuInit(0));
-            let mut dptr: cuda_sys::CUdeviceptr = std::mem::zeroed();
-            let mut handle: cuda_sys::CUmemGenericAllocationHandle = std::mem::zeroed();
-
-            let mut device: cuda_sys::CUdevice = std::mem::zeroed();
-            cu_check!(cuda_sys::cuDeviceGet(&mut device, device_id as i32));
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuInit(0));
+            let mut dptr: rdmaxcel_sys::CUdeviceptr = std::mem::zeroed();
+            let mut handle: rdmaxcel_sys::CUmemGenericAllocationHandle = std::mem::zeroed();
+
+            let mut device: rdmaxcel_sys::CUdevice = std::mem::zeroed();
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuDeviceGet(
+                &mut device,
+                device_id as i32
+            ));
 
-            let mut context: cuda_sys::CUcontext = std::mem::zeroed();
-            cu_check!(cuda_sys::cuCtxCreate_v2(&mut context, 0, device_id as i32));
-            cu_check!(cuda_sys::cuCtxSetCurrent(context));
+            let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed();
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2(
+                &mut context,
+                0,
+                device_id as i32
+            ));
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context));
 
             let mut granularity: usize = 0;
-            let mut prop: cuda_sys::CUmemAllocationProp = std::mem::zeroed();
-            prop.type_ = cuda_sys::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_PINNED;
-            prop.location.type_ = cuda_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE;
+            let mut prop: rdmaxcel_sys::CUmemAllocationProp = std::mem::zeroed();
+            prop.type_ = rdmaxcel_sys::CU_MEM_ALLOCATION_TYPE_PINNED;
+            prop.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE;
             prop.location.id = device;
             prop.allocFlags.gpuDirectRDMACapable = 1;
-            prop.requestedHandleTypes =
-                cuda_sys::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+            prop.requestedHandleTypes = rdmaxcel_sys::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
 
-            cu_check!(cuda_sys::cuMemGetAllocationGranularity(
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuMemGetAllocationGranularity(
                 &mut granularity as *mut usize,
                 &prop,
-                cuda_sys::CUmemAllocationGranularity_flags::CU_MEM_ALLOC_GRANULARITY_MINIMUM,
+                rdmaxcel_sys::CU_MEM_ALLOC_GRANULARITY_MINIMUM,
             ));
 
             // ensure our size is aligned
             let padded_size: usize = ((buffer_size - 1) / granularity + 1) * granularity;
             assert!(padded_size == buffer_size);
 
-            cu_check!(cuda_sys::cuMemCreate(
-                &mut handle as *mut cuda_sys::CUmemGenericAllocationHandle,
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuMemCreate(
+                &mut handle as *mut rdmaxcel_sys::CUmemGenericAllocationHandle,
                 padded_size,
                 &prop,
                 0
             ));
             // reserve and map the memory
-            cu_check!(cuda_sys::cuMemAddressReserve(
-                &mut dptr as *mut cuda_sys::CUdeviceptr,
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuMemAddressReserve(
+                &mut dptr as *mut rdmaxcel_sys::CUdeviceptr,
                 padded_size,
                 0,
                 0,
@@ -317,23 +323,28 @@ impl Actor for CudaRdmaActor {
             assert!(padded_size % granularity == 0);
 
             // fails if a add cu_check macro; but passes if we don't
-            let err = cuda_sys::cuMemMap(
-                dptr as cuda_sys::CUdeviceptr,
+            let err = rdmaxcel_sys::rdmaxcel_cuMemMap(
+                dptr as rdmaxcel_sys::CUdeviceptr,
                 padded_size,
                 0,
-                handle as cuda_sys::CUmemGenericAllocationHandle,
+                handle as rdmaxcel_sys::CUmemGenericAllocationHandle,
                 0,
             );
-            if err != cuda_sys::CUresult::CUDA_SUCCESS {
+            if err != rdmaxcel_sys::CUDA_SUCCESS {
                 panic!("failed reserving and mapping memory {:?}", err);
             }
 
             // set access
-            let mut access_desc: cuda_sys::CUmemAccessDesc = std::mem::zeroed();
-            access_desc.location.type_ = cuda_sys::CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE;
+            let mut access_desc: rdmaxcel_sys::CUmemAccessDesc = std::mem::zeroed();
+            access_desc.location.type_ = rdmaxcel_sys::CU_MEM_LOCATION_TYPE_DEVICE;
             access_desc.location.id = device;
-            access_desc.flags = cuda_sys::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-            cu_check!(cuda_sys::cuMemSetAccess(dptr, padded_size, &access_desc, 1));
+            access_desc.flags = rdmaxcel_sys::CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuMemSetAccess(
+                dptr,
+                padded_size,
+                &access_desc,
+                1
+            ));
             Ok(Self {
                 device_id,
                 cpu_buffer,
@@ -385,15 +396,15 @@ impl Handler<InitializeBuffer> for CudaRdmaActor {
         self.cpu_buffer.fill(value);
 
         unsafe {
-            let mut context: cuda_sys::CUcontext = std::mem::zeroed();
-            cu_check!(cuda_sys::cuCtxCreate_v2(
+            let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed();
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2(
                 &mut context,
                 0,
                 self.device_id as i32
             ));
-            cu_check!(cuda_sys::cuCtxSetCurrent(context));
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context));
             cuda_sys::cudaDeviceSynchronize();
-            cu_check!(cuda_sys::cuMemcpyHtoD_v2(
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuMemcpyHtoD_v2(
                 self.cu_ptr as u64,
                 self.cpu_buffer.as_ptr() as *const std::ffi::c_void,
                 self.cpu_buffer.len()
@@ -459,13 +470,13 @@ impl Handler<PerformPingPong> for CudaRdmaActor {
 
         validate_execution_context().await?;
         unsafe {
-            let mut context: cuda_sys::CUcontext = std::mem::zeroed();
-            cu_check!(cuda_sys::cuCtxCreate_v2(
+            let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed();
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2(
                 &mut context,
                 0,
                 self.device_id as i32
             ));
-            cu_check!(cuda_sys::cuCtxSetCurrent(context));
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context));
         }
         let qp = self
             .rdma_manager
@@ -532,17 +543,17 @@ impl Handler<VerifyBuffer> for CudaRdmaActor {
         VerifyBuffer(expected_values, reply): VerifyBuffer,
     ) -> Result<(), anyhow::Error> {
         unsafe {
-            let mut context: cuda_sys::CUcontext = std::mem::zeroed();
-            cu_check!(cuda_sys::cuCtxCreate_v2(
+            let mut context: rdmaxcel_sys::CUcontext = std::mem::zeroed();
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxCreate_v2(
                 &mut context,
                 0,
                 self.device_id as i32
             ));
-            cu_check!(cuda_sys::cuCtxSetCurrent(context));
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context));
             cuda_sys::cudaDeviceSynchronize();
-            cu_check!(cuda_sys::cuMemcpyDtoH_v2(
+            cu_check!(rdmaxcel_sys::rdmaxcel_cuMemcpyDtoH_v2(
                 self.cpu_buffer.as_mut_ptr() as *mut std::ffi::c_void,
-                self.cu_ptr as cuda_sys::CUdeviceptr,
+                self.cu_ptr as rdmaxcel_sys::CUdeviceptr,
                 self.cpu_buffer.len(),
             ));
         }

diff --git a/monarch_rdma/src/macros.rs b/monarch_rdma/src/macros.rs
@@ -9,9 +9,9 @@
 #[macro_export]
 macro_rules! cu_check {
     ($result:expr) => {
-        if $result != cuda_sys::CUresult::CUDA_SUCCESS {
+        if $result != rdmaxcel_sys::CUDA_SUCCESS {
             let mut error_string: *const std::os::raw::c_char = std::ptr::null();
-            cuda_sys::cuGetErrorString($result, &mut error_string);
+            rdmaxcel_sys::rdmaxcel_cuGetErrorString($result, &mut error_string);
             panic!(
                 "cuda failure {}:{} {:?} '{}'",
                 file!(),

diff --git a/monarch_rdma/src/rdma_manager_actor.rs b/monarch_rdma/src/rdma_manager_actor.rs
@@ -366,13 +366,13 @@ impl RdmaManagerActor {
     ) -> Result<(RdmaMemoryRegionView, String), anyhow::Error> {
         unsafe {
             let mut mem_type: i32 = 0;
-            let ptr = addr as cuda_sys::CUdeviceptr;
-            let err = cuda_sys::cuPointerGetAttribute(
+            let ptr = addr as rdmaxcel_sys::CUdeviceptr;
+            let err = rdmaxcel_sys::rdmaxcel_cuPointerGetAttribute(
                 &mut mem_type as *mut _ as *mut std::ffi::c_void,
-                cuda_sys::CUpointer_attribute_enum::CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+                rdmaxcel_sys::CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
                 ptr,
             );
-            let is_cuda = err == cuda_sys::CUresult::CUDA_SUCCESS;
+            let is_cuda = err == rdmaxcel_sys::CUDA_SUCCESS;
 
             let mut selected_rdma_device = None;
 
@@ -457,11 +457,11 @@ impl RdmaManagerActor {
                 mrv = maybe_mrv.unwrap();
             } else if is_cuda {
                 let mut fd: i32 = -1;
-                cuda_sys::cuMemGetHandleForAddressRange(
-                    &mut fd as *mut i32 as *mut std::ffi::c_void,
-                    addr as cuda_sys::CUdeviceptr,
+                rdmaxcel_sys::rdmaxcel_cuMemGetHandleForAddressRange(
+                    &mut fd,
+                    addr as rdmaxcel_sys::CUdeviceptr,
                     size,
-                    cuda_sys::CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+                    rdmaxcel_sys::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
                     0,
                 );
                 mr = rdmaxcel_sys::ibv_reg_dmabuf_mr(domain_pd, 0, size, 0, fd, access.0 as i32);
Original file line number	Diff line number	Diff line change
Expand Up		@@ -8,5 +8,4 @@

		#pragma once

		#include <cuda.h>
		#include <cuda_runtime.h>