Skip to content

RDMACM failures in Mellanox #1823

@artpol84

Description

@artpol84

Here is some information about the failure. I will post more as I discover it.

The backtrace from the core file:

(gdb) bt
#0  0x00007fffeeb68db2 in progress_one_device (device=0x0) at btl_openib_component.c:3698
#1  0x00007fffeeb691fa in btl_openib_component_progress () at btl_openib_component.c:3774
#2  0x00007ffff7624f62 in opal_progress () at runtime/opal_progress.c:221
#3  0x00007ffff7ca3d06 in ompi_mpi_init (argc=1, argv=0x7fffffffd9b8, requested=0, provided=0x7fffffffd768) at runtime/ompi_mpi_init.c:838
#4  0x00007ffff7ce58bf in PMPI_Init (argc=0x7fffffffd7ac, argv=0x7fffffffd7a0) at pinit.c:66
#5  0x0000000000400826 in main (argc=1, argv=0x7fffffffd9b8) at hello_c.c:18

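The frame (note that `devices_count` is 2 while the `devices` pointer array reports `size = 1`, so the item fetched at `i = 1` is NULL and `progress_one_device` is called with `device=0x0`):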

(gdb) frame 1
#1  0x00007fffeeb691fa in btl_openib_component_progress () at btl_openib_component.c:3774
3774            count += progress_one_device(device);
(gdb) l
3769        }
3770
3771        for(i = 0; i < mca_btl_openib_component.devices_count; i++) {
3772            mca_btl_openib_device_t *device =
3773                (mca_btl_openib_device_t *) opal_pointer_array_get_item(&mca_btl_openib_component.devices, i);
3774            count += progress_one_device(device);
3775        }
3776
3777    #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
3778        /* Check to see if there are any outstanding dtoh CUDA events that
(gdb) p i
$1 = 1
(gdb) p mca_btl_openib_component.devices
$2 = {super = {obj_magic_id = 16046253926196952813, obj_class = 0x7ffff7929060, obj_reference_count = 1, cls_init_file_name =
    0x7fffeeb92233 "btl_openib_component.c", cls_init_lineno = 187}, lock = {super = {obj_magic_id = 16046253926196952813, obj_class =
    0x7ffff79294e0, obj_reference_count = 1, cls_init_file_name = 0x7ffff76e2e1a "class/opal_pointer_array.c", cls_init_lineno = 45},
    m_lock_pthread = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 2, __spins = 0, __list = {__prev = 0x0, __next =
    0x0}}, __size = '\000' <repeats 16 times>, "\002", '\000' <repeats 22 times>, __align = 0}, m_lock_debug = 0, m_lock_file = 0x0,
    m_lock_line = 0, m_lock_atomic = {u = {lock = 0, sparc_lock = 0 '\000', padding = "\000\000\000"}}}, lowest_free = 1, number_free = 0,
  size = 1, max_size = 2147483647, block_size = 0, addr = 0x78c2b0}
(gdb) p mca_btl_openib_component.devices_count
$3 = 2

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions