-
Notifications
You must be signed in to change notification settings - Fork 931
Closed
open-mpi/ompi-release
#1248
Description
Here is some info about the failure. I will post more as I discover it.
The backtrace from the core file:
(gdb) bt
#0 0x00007fffeeb68db2 in progress_one_device (device=0x0) at btl_openib_component.c:3698
#1 0x00007fffeeb691fa in btl_openib_component_progress () at btl_openib_component.c:3774
#2 0x00007ffff7624f62 in opal_progress () at runtime/opal_progress.c:221
#3 0x00007ffff7ca3d06 in ompi_mpi_init (argc=1, argv=0x7fffffffd9b8, requested=0, provided=0x7fffffffd768) at runtime/ompi_mpi_init.c:838
#4 0x00007ffff7ce58bf in PMPI_Init (argc=0x7fffffffd7ac, argv=0x7fffffffd7a0) at pinit.c:66
#5 0x0000000000400826 in main (argc=1, argv=0x7fffffffd9b8) at hello_c.c:18
The frame:
(gdb) frame 1
#1 0x00007fffeeb691fa in btl_openib_component_progress () at btl_openib_component.c:3774
3774 count += progress_one_device(device);
(gdb) l
3769 }
3770
3771 for(i = 0; i < mca_btl_openib_component.devices_count; i++) {
3772 mca_btl_openib_device_t *device =
3773 (mca_btl_openib_device_t *) opal_pointer_array_get_item(&mca_btl_openib_component.devices, i);
3774 count += progress_one_device(device);
3775 }
3776
3777 #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
3778 /* Check to see if there are any outstanding dtoh CUDA events that
(gdb) p i
$1 = 1
(gdb) p mca_btl_openib_component.devices
$2 = {super = {obj_magic_id = 16046253926196952813, obj_class = 0x7ffff7929060, obj_reference_count = 1, cls_init_file_name =
0x7fffeeb92233 "btl_openib_component.c", cls_init_lineno = 187}, lock = {super = {obj_magic_id = 16046253926196952813, obj_class =
0x7ffff79294e0, obj_reference_count = 1, cls_init_file_name = 0x7ffff76e2e1a "class/opal_pointer_array.c", cls_init_lineno = 45},
m_lock_pthread = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 2, __spins = 0, __list = {__prev = 0x0, __next =
0x0}}, __size = '\000' <repeats 16 times>, "\002", '\000' <repeats 22 times>, __align = 0}, m_lock_debug = 0, m_lock_file = 0x0,
m_lock_line = 0, m_lock_atomic = {u = {lock = 0, sparc_lock = 0 '\000', padding = "\000\000\000"}}}, lowest_free = 1, number_free = 0,
size = 1, max_size = 2147483647, block_size = 0, addr = 0x78c2b0}
(gdb) p mca_btl_openib_component.devices_count
$3 = 2