From 59f304b9e9556073f0a371b4bb2e9160ddae906c Mon Sep 17 00:00:00 2001 From: Joshua Hursey Date: Mon, 6 Jun 2016 11:34:01 -0400 Subject: [PATCH 1/2] coll/base: neg. priority cleanup, verbose output improvements * Print a verbose message if the component was disqualified because of a negative priority. * If a disqualified component provided a module, release it. * Display list of selected components in priority order - During the process of volunteering collective functions for a communicator, print the component name and priority. This will cause the verbose messages to be displayed in reverse priority order (lowest priority first, up to highest). This is helpful when determining which collective components are active in which order for a given communicator. To see the messages you need the following MCA parameter set to 9 or higher: `-mca coll_base_verbose 9` * Adjust verbose for commonly needed verbose output from 10 to 9 to make it easier to access this information. --- ompi/mca/coll/base/coll_base_comm_select.c | 24 +++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/ompi/mca/coll/base/coll_base_comm_select.c b/ompi/mca/coll/base/coll_base_comm_select.c index 8695f570328..6335654f3fc 100644 --- a/ompi/mca/coll/base/coll_base_comm_select.c +++ b/ompi/mca/coll/base/coll_base_comm_select.c @@ -19,6 +19,7 @@ * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2016 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -52,6 +53,7 @@ struct avail_coll_t { int ac_priority; mca_coll_base_module_2_1_0_t *ac_module; + const char * ac_component_name; }; typedef struct avail_coll_t avail_coll_t; @@ -110,7 +112,7 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) int ret; /* Announce */ - opal_output_verbose(10, ompi_coll_base_framework.framework_output, + opal_output_verbose(9, ompi_coll_base_framework.framework_output, "coll:base:comm_select: new communicator: %s (cid %d)", comm->c_name, comm->c_contextid); @@ -143,6 +145,12 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm) /* initialize the module */ ret = avail->ac_module->coll_module_enable(avail->ac_module, comm); + + opal_output_verbose(9, ompi_coll_base_framework.framework_output, + "coll:base:comm_select: selecting %10s, priority %3d, %s", + avail->ac_component_name, avail->ac_priority, + (OMPI_SUCCESS == ret ? "Enabled": "Disabled") ); + if (OMPI_SUCCESS == ret) { /* copy over any of the pointers */ @@ -295,9 +303,23 @@ static opal_list_t *check_components(opal_list_t * components, avail = OBJ_NEW(avail_coll_t); avail->ac_priority = priority; avail->ac_module = module; + // Point to the string so we don't have to free later + avail->ac_component_name = component->mca_component_name; opal_list_append(selectable, &avail->super); } + else { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:base:comm_select: component disqualified: %s (priority %d < 0)", + component->mca_component_name, priority ); + + // If the disqualified collective returned a module make sure we + // release it here, since it will become a leak otherwise. + if( NULL != module ) { + OBJ_RELEASE(module); + module = NULL; + } + } } /* If we didn't find any available components, return an error */ From 0a09f8bc514acf5a2283d6d443dda2053a710ed1 Mon Sep 17 00:00:00 2001 From: Joshua Hursey Date: Wed, 8 Jun 2016 23:01:23 -0400 Subject: [PATCH 2/2] coll/hcoll: Protect module destruct when not fully initialized * If hcoll is given a negative priority, but not enabled=0 then the module is constructed, but then destructed before calling it's query(). So the previous pointers are not initialized. If we try to OBJ_RELEASE them in a debug build an assert will fire. This commit adds some protection against that and initializes the _module pointers to NULL. --- ompi/mca/coll/hcoll/coll_hcoll_module.c | 67 ++++++++++++++++++------- 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/ompi/mca/coll/hcoll/coll_hcoll_module.c b/ompi/mca/coll/hcoll/coll_hcoll_module.c index 7814cc1d8e1..bb849dce0fc 100644 --- a/ompi/mca/coll/hcoll/coll_hcoll_module.c +++ b/ompi/mca/coll/hcoll/coll_hcoll_module.c @@ -1,5 +1,6 @@ /** Copyright (c) 2011 Mellanox Technologies. All rights reserved. + Copyright (c) 2016 IBM Corporation. All rights reserved. $COPYRIGHT$ Additional copyrights may follow @@ -50,6 +51,32 @@ static void mca_coll_hcoll_module_clear(mca_coll_hcoll_module_t *hcoll_module) hcoll_module->previous_iallgatherv = NULL; hcoll_module->previous_igatherv = NULL; hcoll_module->previous_ireduce = NULL; + hcoll_module->previous_ialltoall = NULL; + hcoll_module->previous_ialltoallv = NULL; + + hcoll_module->previous_barrier_module = NULL; + hcoll_module->previous_bcast_module = NULL; + hcoll_module->previous_allreduce_module = NULL; + hcoll_module->previous_reduce_module = NULL; + hcoll_module->previous_allgather_module = NULL; + hcoll_module->previous_allgatherv_module = NULL; + hcoll_module->previous_gather_module = NULL; + hcoll_module->previous_gatherv_module = NULL; + hcoll_module->previous_alltoall_module = NULL; + hcoll_module->previous_alltoallv_module = NULL; + hcoll_module->previous_alltoallw_module = NULL; + hcoll_module->previous_reduce_scatter_module = NULL; + hcoll_module->previous_ibarrier_module = NULL; + hcoll_module->previous_ibcast_module = NULL; + hcoll_module->previous_iallreduce_module = NULL; + hcoll_module->previous_ireduce_module = NULL; + hcoll_module->previous_iallgather_module = NULL; + hcoll_module->previous_iallgatherv_module = NULL; + hcoll_module->previous_igatherv_module = NULL; + hcoll_module->previous_ialltoall_module = NULL; + hcoll_module->previous_ialltoallv_module = NULL; + + } static void mca_coll_hcoll_module_construct(mca_coll_hcoll_module_t *hcoll_module) @@ -63,6 +90,8 @@ void mca_coll_hcoll_mem_release_cb(void *buf, size_t length, hcoll_mem_unmap(buf, length, cbdata, from_alloc); } +#define OBJ_RELEASE_IF_NOT_NULL( obj ) if( NULL != (obj) ) OBJ_RELEASE( obj ); + static void mca_coll_hcoll_module_destruct(mca_coll_hcoll_module_t *hcoll_module) { int context_destroyed; @@ -79,25 +108,25 @@ static void mca_coll_hcoll_module_destruct(mca_coll_hcoll_module_t *hcoll_module destroy hcoll context*/ if (hcoll_module->hcoll_context != NULL){ - OBJ_RELEASE(hcoll_module->previous_barrier_module); - OBJ_RELEASE(hcoll_module->previous_bcast_module); - OBJ_RELEASE(hcoll_module->previous_allreduce_module); - OBJ_RELEASE(hcoll_module->previous_allgather_module); - OBJ_RELEASE(hcoll_module->previous_allgatherv_module); - OBJ_RELEASE(hcoll_module->previous_gatherv_module); - OBJ_RELEASE(hcoll_module->previous_alltoall_module); - OBJ_RELEASE(hcoll_module->previous_alltoallv_module); - OBJ_RELEASE(hcoll_module->previous_reduce_module); - - OBJ_RELEASE(hcoll_module->previous_ibarrier_module); - OBJ_RELEASE(hcoll_module->previous_ibcast_module); - OBJ_RELEASE(hcoll_module->previous_iallreduce_module); - OBJ_RELEASE(hcoll_module->previous_iallgather_module); - OBJ_RELEASE(hcoll_module->previous_iallgatherv_module); - OBJ_RELEASE(hcoll_module->previous_igatherv_module); - OBJ_RELEASE(hcoll_module->previous_ialltoall_module); - OBJ_RELEASE(hcoll_module->previous_ialltoallv_module); - OBJ_RELEASE(hcoll_module->previous_ireduce_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_barrier_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_bcast_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_allreduce_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_allgather_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_allgatherv_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_gatherv_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_alltoall_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_alltoallv_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_reduce_module); + + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_ibarrier_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_ibcast_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_iallreduce_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_iallgather_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_iallgatherv_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_igatherv_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_ialltoall_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_ialltoallv_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_ireduce_module); /* OBJ_RELEASE(hcoll_module->previous_allgatherv_module);