
Commit 90b7edc

UCT/CUDA/CUDA_IPC: Reduced log level of rkey unpacking failures. (#10772)
1 parent: 375089f

4 files changed: +57, -31 lines


src/uct/cuda/cuda_ipc/cuda_ipc_cache.c

Lines changed: 37 additions & 28 deletions
@@ -194,7 +194,8 @@ static void uct_cuda_ipc_cache_purge(uct_cuda_ipc_cache_t *cache)
 
 static ucs_status_t
 uct_cuda_ipc_open_memhandle_legacy(CUipcMemHandle memh, CUdevice cu_dev,
-                                   CUdeviceptr *mapped_addr)
+                                   CUdeviceptr *mapped_addr,
+                                   ucs_log_level_t log_level)
 {
     CUresult cuerr;
     ucs_status_t status;
@@ -207,8 +208,8 @@ uct_cuda_ipc_open_memhandle_legacy(CUipcMemHandle memh, CUdevice cu_dev,
     cuerr = cuIpcOpenMemHandle(mapped_addr, memh,
                                CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
     if (cuerr != CUDA_SUCCESS) {
-        ucs_debug("cuIpcOpenMemHandle() failed: %s",
-                  uct_cuda_base_cu_get_error_string(cuerr));
+        ucs_log(log_level, "cuIpcOpenMemHandle() failed: %s",
+                uct_cuda_base_cu_get_error_string(cuerr));
         status = (cuerr == CUDA_ERROR_ALREADY_MAPPED) ?
                  UCS_ERR_ALREADY_EXISTS : UCS_ERR_INVALID_PARAM;
     }
@@ -227,35 +228,38 @@ uct_cuda_ipc_init_access_desc(CUmemAccessDesc *access_desc, CUdevice cu_dev)
 }
 
 static ucs_status_t
-uct_cuda_ipc_open_memhandle_vmm(uct_cuda_ipc_rkey_t *key, CUdevice cu_dev,
-                                CUdeviceptr *mapped_addr)
+uct_cuda_ipc_open_memhandle_vmm(const uct_cuda_ipc_rkey_t *key, CUdevice cu_dev,
+                                CUdeviceptr *mapped_addr,
+                                ucs_log_level_t log_level)
 {
     CUmemAccessDesc access_desc = {};
     ucs_status_t status;
     CUdeviceptr dptr;
     CUmemGenericAllocationHandle handle;
 
-    status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemImportFromShareableHandle(&handle,
+    status = UCT_CUDADRV_FUNC(cuMemImportFromShareableHandle(&handle,
                 (void*)&key->ph.handle.fabric_handle,
-                CU_MEM_HANDLE_TYPE_FABRIC));
+                CU_MEM_HANDLE_TYPE_FABRIC), log_level);
     if (status != UCS_OK) {
         goto out;
     }
 
-    status =
-        UCT_CUDADRV_FUNC_LOG_ERR(cuMemAddressReserve(&dptr, key->b_len, 0, 0, 0));
+    status = UCT_CUDADRV_FUNC(cuMemAddressReserve(&dptr, key->b_len, 0, 0, 0),
+                              log_level);
     if (status != UCS_OK) {
         goto release_handle;
     }
 
-    status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemMap(dptr, key->b_len, 0, handle, 0));
+    status = UCT_CUDADRV_FUNC(cuMemMap(dptr, key->b_len, 0, handle, 0),
+                              log_level);
     if (status != UCS_OK) {
         goto release_va_range;
     }
 
     uct_cuda_ipc_init_access_desc(&access_desc, cu_dev);
 
-    status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemSetAccess(dptr, key->b_len, &access_desc, 1));
+    status = UCT_CUDADRV_FUNC(cuMemSetAccess(dptr, key->b_len, &access_desc, 1),
+                              log_level);
     if (status != UCS_OK) {
         goto unmap_range;
     }
@@ -315,8 +319,9 @@ static ucs_status_t cuda_ipc_rem_mpool_cache_create(uct_cuda_ipc_rkey_t *key,
 }
 
 static ucs_status_t
-uct_cuda_ipc_open_memhandle_mempool(uct_cuda_ipc_rkey_t *key,
-                                    CUdevice cu_dev, CUdeviceptr *mapped_addr)
+uct_cuda_ipc_open_memhandle_mempool(uct_cuda_ipc_rkey_t *key, CUdevice cu_dev,
+                                    CUdeviceptr *mapped_addr,
+                                    ucs_log_level_t log_level)
 {
     khash_t(cuda_ipc_rem_mpool_cache) *hash = &uct_cuda_ipc_rem_mpool_cache.hash;
     const CUmemFabricHandle *hkey = &key->ph.handle.fabric_handle;
@@ -360,18 +365,18 @@ uct_cuda_ipc_open_memhandle_mempool(uct_cuda_ipc_rkey_t *key,
     }
 
 out_import_pointer:
-    status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemPoolImportPointer(mapped_addr,
-                key->ph.pool, (CUmemPoolPtrExportData*)&key->ph.ptr));
+    status = UCT_CUDADRV_FUNC(cuMemPoolImportPointer(mapped_addr, key->ph.pool,
+                (CUmemPoolPtrExportData*)&key->ph.ptr), log_level);
 
 err:
     pthread_rwlock_unlock(&uct_cuda_ipc_rem_mpool_cache.lock);
     return status;
 }
 #endif
 
-static ucs_status_t uct_cuda_ipc_open_memhandle(uct_cuda_ipc_rkey_t *key,
-                                                CUdevice cu_dev,
-                                                CUdeviceptr *mapped_addr)
+static ucs_status_t
+uct_cuda_ipc_open_memhandle(uct_cuda_ipc_rkey_t *key, CUdevice cu_dev,
+                            CUdeviceptr *mapped_addr, ucs_log_level_t log_level)
 {
     ucs_log_level_t level;
 
@@ -380,12 +385,14 @@ static ucs_status_t uct_cuda_ipc_open_memhandle(uct_cuda_ipc_rkey_t *key,
     switch(key->ph.handle_type) {
     case UCT_CUDA_IPC_KEY_HANDLE_TYPE_LEGACY:
         return uct_cuda_ipc_open_memhandle_legacy(key->ph.handle.legacy, cu_dev,
-                                                  mapped_addr);
+                                                  mapped_addr, log_level);
 #if HAVE_CUDA_FABRIC
     case UCT_CUDA_IPC_KEY_HANDLE_TYPE_VMM:
-        return uct_cuda_ipc_open_memhandle_vmm(key, cu_dev, mapped_addr);
+        return uct_cuda_ipc_open_memhandle_vmm(key, cu_dev, mapped_addr,
+                                               log_level);
     case UCT_CUDA_IPC_KEY_HANDLE_TYPE_MEMPOOL:
-        return uct_cuda_ipc_open_memhandle_mempool(key, cu_dev, mapped_addr);
+        return uct_cuda_ipc_open_memhandle_mempool(key, cu_dev, mapped_addr,
+                                                   log_level);
 #endif
     case UCT_CUDA_IPC_KEY_HANDLE_TYPE_NO_IPC:
         level = UCS_LOG_LEVEL_DEBUG;
@@ -520,8 +527,9 @@ ucs_status_t uct_cuda_ipc_unmap_memhandle(pid_t pid, uintptr_t d_bptr,
 }
 
 UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_map_memhandle,
-                 (key, cu_dev, mapped_addr),
-                 uct_cuda_ipc_rkey_t *key, CUdevice cu_dev, void **mapped_addr)
+                 (key, cu_dev, mapped_addr, log_level),
+                 uct_cuda_ipc_rkey_t *key, CUdevice cu_dev, void **mapped_addr,
+                 ucs_log_level_t log_level)
 {
     uct_cuda_ipc_cache_t *cache;
     ucs_status_t status;
@@ -570,22 +578,23 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_map_memhandle,
         }
     }
 
-    status = uct_cuda_ipc_open_memhandle(key, cu_dev, (CUdeviceptr*)mapped_addr);
+    status = uct_cuda_ipc_open_memhandle(key, cu_dev, (CUdeviceptr*)mapped_addr,
+                                         log_level);
     if (ucs_unlikely(status != UCS_OK)) {
         if (ucs_likely(status == UCS_ERR_ALREADY_EXISTS)) {
             /* unmap all overlapping regions and retry*/
             uct_cuda_ipc_cache_invalidate_regions(cache, (void *)key->d_bptr,
                                                   UCS_PTR_BYTE_OFFSET(key->d_bptr,
                                                                       key->b_len));
             status = uct_cuda_ipc_open_memhandle(key, cu_dev,
-                                                 (CUdeviceptr*)mapped_addr);
+                                                 (CUdeviceptr*)mapped_addr,
+                                                 log_level);
             if (ucs_unlikely(status != UCS_OK)) {
                 if (ucs_likely(status == UCS_ERR_ALREADY_EXISTS)) {
                     /* unmap all cache entries and retry */
                     uct_cuda_ipc_cache_purge(cache);
-                    status =
-                        uct_cuda_ipc_open_memhandle(key, cu_dev,
-                                                    (CUdeviceptr*)mapped_addr);
+                    status = uct_cuda_ipc_open_memhandle(
+                            key, cu_dev, (CUdeviceptr*)mapped_addr, log_level);
                     if (status != UCS_OK) {
                         ucs_fatal("%s: failed to open ipc mem handle. addr:%p "
                                   "len:%lu (%s)", cache->name,

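The cuda_ipc_cache.c hunks above swap the fixed-level UCT_CUDADRV_FUNC_LOG_ERR() wrapper for UCT_CUDADRV_FUNC() with an explicit log level, so the same failed CUDA driver call can be reported as an error on the data path but only as a debug message on probe paths. The snippet below is a minimal, self-contained sketch of that pattern; the EXAMPLE_* macro, enum, and helper are illustrative stand-ins, not the actual UCX definitions.

    /* Illustrative stand-in for the UCT_CUDADRV_FUNC(<call>, <log_level>)
     * pattern: the failing driver call is reported at the level the caller
     * picks. Builds with gcc/clang against the CUDA driver API. */
    #include <cuda.h>
    #include <stdio.h>

    typedef enum { EXAMPLE_LOG_ERROR, EXAMPLE_LOG_DEBUG } example_log_level_t;

    static void example_log(example_log_level_t level, const char *call,
                            CUresult err)
    {
        fprintf(stderr, "[%s] %s failed: %d\n",
                (level == EXAMPLE_LOG_ERROR) ? "error" : "debug", call, (int)err);
    }

    /* Evaluate a CUDA driver call; on failure, log at the caller-chosen level
     * (uses a GCC/clang statement expression) */
    #define EXAMPLE_CUDADRV_FUNC(_call, _level)          \
        ({                                               \
            CUresult _err = (_call);                     \
            if (_err != CUDA_SUCCESS) {                  \
                example_log((_level), #_call, _err);     \
            }                                            \
            _err;                                        \
        })

    int main(void)
    {
        CUdevice dev;

        /* Data-path style call site: a failure here is a real error */
        if (EXAMPLE_CUDADRV_FUNC(cuInit(0), EXAMPLE_LOG_ERROR) != CUDA_SUCCESS) {
            return 1;
        }

        /* Probe-style call site: a failure is expected and only worth a
         * debug line */
        (void)EXAMPLE_CUDADRV_FUNC(cuDeviceGet(&dev, 0), EXAMPLE_LOG_DEBUG);
        return 0;
    }

Threading the level through the call chain, rather than hard-coding error-level reporting inside the helpers, is what lets the peer-accessibility check in cuda_ipc_md.c (last hunk below) demote expected rkey mapping failures to debug.
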
src/uct/cuda/cuda_ipc/cuda_ipc_cache.h

Lines changed: 16 additions & 1 deletion
@@ -44,9 +44,24 @@ ucs_status_t uct_cuda_ipc_create_cache(uct_cuda_ipc_cache_t **cache,
 void uct_cuda_ipc_destroy_cache(uct_cuda_ipc_cache_t *cache);
 
 
+/**
+ * @brief Map an interprocess memory handle to a local address
+ *
+ * This function maps an interprocess memory handle exported from another
+ * process to a local virtual address that can be accessed by the current
+ * process.
+ *
+ * @param key          Pointer to the CUDA IPC remote memory key containing
+ *                     the memory handle and other metadata needed for mapping
+ * @param cu_dev       CUDA device handle where the memory should be mapped
+ * @param mapped_addr  Pointer to store the resulting mapped local address
+ * @param log_level    Log level for reporting failures during mapping operation
+ *
+ * @return UCS_OK on success, or error status on failure
+ */
 ucs_status_t
 uct_cuda_ipc_map_memhandle(uct_cuda_ipc_rkey_t *key, CUdevice cu_dev,
-                           void **mapped_addr);
+                           void **mapped_addr, ucs_log_level_t log_level);
 
 
 ucs_status_t uct_cuda_ipc_unmap_memhandle(pid_t pid, uintptr_t d_bptr,
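The header comment above documents the new log_level argument to uct_cuda_ipc_map_memhandle(). A minimal, hypothetical caller sketch follows; only the function signature and the UCS_* constants come from this change, while the helper name and the include are illustrative.

    /* Hypothetical probe helper: check whether a remote rkey can be mapped
     * without emitting error-level logs when it cannot. */
    #include "cuda_ipc_cache.h" /* uct_cuda_ipc_map_memhandle(), uct_cuda_ipc_rkey_t */

    static int example_rkey_is_mappable(uct_cuda_ipc_rkey_t *key, CUdevice cu_dev)
    {
        void *mapped_addr;
        ucs_status_t status;

        /* Expected failures are demoted to debug level; a data path would pass
         * UCS_LOG_LEVEL_ERROR instead, as cuda_ipc_ep.c does below. */
        status = uct_cuda_ipc_map_memhandle(key, cu_dev, &mapped_addr,
                                            UCS_LOG_LEVEL_DEBUG);
        return (status == UCS_OK) || (status == UCS_ERR_ALREADY_EXISTS);
    }
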

src/uct/cuda/cuda_ipc/cuda_ipc_ep.c

Lines changed: 2 additions & 1 deletion
@@ -113,7 +113,8 @@ uct_cuda_ipc_post_cuda_async_copy(uct_ep_h tl_ep, uint64_t remote_addr,
         return status;
     }
 
-    status = uct_cuda_ipc_map_memhandle(&key->super, cuda_device, &mapped_addr);
+    status = uct_cuda_ipc_map_memhandle(&key->super, cuda_device, &mapped_addr,
+                                        UCS_LOG_LEVEL_ERROR);
     if (ucs_unlikely(status != UCS_OK)) {
         goto out;
     }

src/uct/cuda/cuda_ipc/cuda_ipc_md.c

Lines changed: 2 additions & 1 deletion
@@ -347,7 +347,8 @@ uct_cuda_ipc_is_peer_accessible(uct_cuda_ipc_component_t *component,
      * Now, we immediately insert into cache to save on calling
      * OpenMemHandle for the same handle because the cache is globally
      * accessible using rkey->pid. */
-    status = uct_cuda_ipc_map_memhandle(&rkey->super, cu_dev, &d_mapped);
+    status = uct_cuda_ipc_map_memhandle(&rkey->super, cu_dev, &d_mapped,
+                                        UCS_LOG_LEVEL_DEBUG);
 
     *accessible = ((status == UCS_OK) || (status == UCS_ERR_ALREADY_EXISTS))
                   ? UCS_YES : UCS_NO;
