diff --git a/ompi/mca/hook/comm_method/hook_comm_method.h b/ompi/mca/hook/comm_method/hook_comm_method.h
index 5f93360d370..4e643780674 100644
--- a/ompi/mca/hook/comm_method/hook_comm_method.h
+++ b/ompi/mca/hook/comm_method/hook_comm_method.h
@@ -24,6 +24,8 @@ extern int mca_hook_comm_method_verbose;
 extern int mca_hook_comm_method_output;
 extern bool mca_hook_comm_method_enable_mpi_init;
 extern bool mca_hook_comm_method_enable_mpi_finalize;
+extern bool mca_hook_comm_method_enable_show_aff;
+extern int mca_hook_comm_method_aff_columns;
 extern int mca_hook_comm_method_max;
 extern int mca_hook_comm_method_brief;
 extern char *mca_hook_comm_method_fakefile;
diff --git a/ompi/mca/hook/comm_method/hook_comm_method_component.c b/ompi/mca/hook/comm_method/hook_comm_method_component.c
index ca27c8703be..e053299e790 100644
--- a/ompi/mca/hook/comm_method/hook_comm_method_component.c
+++ b/ompi/mca/hook/comm_method/hook_comm_method_component.c
@@ -71,20 +71,25 @@ enum mca_hook_comm_method_mode_flags_t {
     OMPI_HOOK_COMM_METHOD_INIT = 0x01,
     /* Display on MPI_FINALIZE */
     OMPI_HOOK_COMM_METHOD_FINALIZE = 0x02,
+    /* Affinity is shown by default; the 'noaff' flag skips it */
+    OMPI_HOOK_COMM_METHOD_SKIP_AFFINITY = 0x04,
 };

 int mca_hook_comm_method_verbose = 0;
 int mca_hook_comm_method_output = -1;
 bool mca_hook_comm_method_enable_mpi_init = false;
 bool mca_hook_comm_method_enable_mpi_finalize = false;
+bool mca_hook_comm_method_enable_show_aff = true;
 uint32_t mca_hook_comm_method_enabled_flags = 0x00;
 int mca_hook_comm_method_max = 12;
+int mca_hook_comm_method_aff_columns = 80;
 int mca_hook_comm_method_brief = 0;
 char *mca_hook_comm_method_fakefile = NULL;

 static mca_base_var_enum_value_flag_t mca_hook_comm_method_modes[] = {
     {.flag = OMPI_HOOK_COMM_METHOD_INIT, .string = "mpi_init"},
     {.flag = OMPI_HOOK_COMM_METHOD_FINALIZE, .string = "mpi_finalize"},
+    {.flag = OMPI_HOOK_COMM_METHOD_SKIP_AFFINITY, .string = "noaff"},
     {0, NULL, 0}
 };

@@ -137,6 +142,7 @@ static int ompi_hook_comm_method_component_register(void)
      */
     mca_hook_comm_method_enable_mpi_init = false;
     mca_hook_comm_method_enable_mpi_finalize = false;
+    mca_hook_comm_method_enable_show_aff = true;
     mca_base_var_enum_create_flag("ompi_comm_method", mca_hook_comm_method_modes, &mca_hook_comm_method_flags);
     ret = mca_base_component_var_register(&mca_hook_comm_method_component.hookm_version, "display",
@@ -161,6 +167,9 @@ static int ompi_hook_comm_method_component_register(void)
     if( mca_hook_comm_method_enabled_flags & OMPI_HOOK_COMM_METHOD_FINALIZE ) {
         mca_hook_comm_method_enable_mpi_finalize = true;
     }
+    if( mca_hook_comm_method_enabled_flags & OMPI_HOOK_COMM_METHOD_SKIP_AFFINITY ) {
+        mca_hook_comm_method_enable_show_aff = false;
+    }
 }

 // hook_comm_method_max
@@ -171,6 +180,17 @@ static int ompi_hook_comm_method_component_register(void)
                                           OPAL_INFO_LVL_3,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &mca_hook_comm_method_max);
+    // hook_comm_method_aff_columns
+    ret = mca_base_component_var_register(&mca_hook_comm_method_component.hookm_version, "aff_columns",
+                                          "Number of columns to use when printing the affinity strings.",
+                                          MCA_BASE_VAR_TYPE_INT, NULL,
+                                          0, 0,
+                                          OPAL_INFO_LVL_3,
+                                          MCA_BASE_VAR_SCOPE_READONLY,
+                                          &mca_hook_comm_method_aff_columns);
+
+    (void) mca_base_var_register_synonym(ret, "ompi", "ompi", NULL, "display_comm_aff_columns", MCA_BASE_VAR_SYN_FLAG_INTERNAL);
+
     // hook_comm_method_brief
     (void) mca_base_component_var_register(&mca_hook_comm_method_component.hookm_version, "brief",
                                            "Only print the comm method summary, skip the 2d table.",
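Note: the fns.c diff that follows routes every bounded string copy through mystrncpy(), whose definition lies outside the hunks shown here. Judging from its call sites (the size argument always counts the terminating NUL, e.g. max - strlen(str) + 1), a helper of roughly the following shape is assumed. This is a sketch for the reader, not the actual Open MPI implementation:

    #include <string.h>

    /* Assumed sketch of mystrncpy(): like strncpy(), but the destination is
     * always NUL-terminated, and 'n' is the total destination size including
     * the terminator -- matching the sizes passed at the call sites below. */
    static char *
    mystrncpy(char *to, const char *from, int n)
    {
        if (n <= 0) {
            return to;
        }
        strncpy(to, from, n - 1);  /* copies at most n-1 payload chars */
        to[n - 1] = 0;             /* terminate even if 'from' was truncated */
        return to;
    }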
diff --git a/ompi/mca/hook/comm_method/hook_comm_method_fns.c b/ompi/mca/hook/comm_method/hook_comm_method_fns.c
index 42762884dce..2053297b14e 100644
--- a/ompi/mca/hook/comm_method/hook_comm_method_fns.c
+++ b/ompi/mca/hook/comm_method/hook_comm_method_fns.c
@@ -68,7 +68,7 @@ lookup_mtl_name(void)
 // Find the send btl's module:component:name for the incoming comm,rank
 static char*
 lookup_btl_name_for_send(ompi_communicator_t* comm, int rank) {
-    ompi_proc_t *dst_proc = ompi_group_peer_lookup_existing(comm->c_remote_group, rank);
+    ompi_proc_t *dst_proc = ompi_group_peer_lookup(comm->c_remote_group, rank);

     mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint(dst_proc);
     if (endpoint &&
@@ -123,9 +123,9 @@
 static void
 init_string_to_conversion_struct(comm_method_string_conversion_t *data)
 {
     data->n = 0;
-    strcpy(data->str[data->n], "n/a");
+    mystrncpy(data->str[data->n], "n/a", COMM_METHOD_STRING_SIZE);
     ++(data->n);
-    strcpy(data->str[data->n], "self");
+    mystrncpy(data->str[data->n], "self", COMM_METHOD_STRING_SIZE);
     ++(data->n);
 }
@@ -216,6 +216,11 @@ static char* comm_method_to_string(int method);
 static int icompar(const void *a, const void *b);
 static void abbreviate_list_into_string(char *str, int max, int *list, int nlist);
 static void ompi_report_comm_methods(int called_from_location);
+static void host_leader_printstring(char *hoststring,
+    int myleaderrank, int nleaderranks, ompi_communicator_t *leader_comm,
+    int cols, int indent);
+static void host_local_get_affstring(char **affstring, int indentation,
+    int mylocalrank, int nlocalranks, ompi_communicator_t *local_comm);

 void ompi_hook_comm_method_mpi_init_bottom(int argc, char **argv, int requested, int *provided)
 {
@@ -269,18 +274,29 @@
  * the start of a new contiguous chunk, print the previous hi,lo chunk.
  * In general we can tell if we're allowed to write more into the string
  * based on whether the previous iteration wrote ".." onto the end.
+ */
+/*
+ * A note about string sizes: 'max' is the number of chars that can be
+ * printed into str, which was allocated with max+1 total bytes to leave
+ * room for 'max' chars plus a NUL terminator. For snprintf() that
+ * translates to a size argument of max+1. When appending to the end of
+ * str, eg snprintf()ing into &str[strlen(str)], there is room for
+ * max - strlen(str) printable chars, so the size argument to snprintf()
+ * becomes max - strlen(str) + 1.
 */
     if (list[i] == hi+1) {
         hi = list[i];
     } else if (list[i] > hi) {
         if (strlen(str)==0 || str[strlen(str)-1] != '.') {
             if (strlen(str) != 0) {
-                strcpy(&str[strlen(str)], ", ");
+                mystrncpy(&str[strlen(str)], ", ", max - strlen(str) + 1);
             }
             if (lo != hi) {
-                sprintf(&str[strlen(str)], "%d - %d", lo, hi);
+                snprintf(&str[strlen(str)], max - strlen(str) + 1,
+                         "%d - %d", lo, hi);
             } else {
-                sprintf(&str[strlen(str)], "%d", lo);
+                snprintf(&str[strlen(str)], max - strlen(str) + 1,
+                         "%d", lo);
             }
         }
 /*
@@ -294,7 +310,7 @@ abbreviate_list_into_string(char *str, int max, int *list, int nlist)
             && (strlen(str) == 0 || str[strlen(str)-1] != '.'))
         {
-            strcpy(&str[strlen(str)], ", ..");
+            mystrncpy(&str[strlen(str)], ", ..", max - strlen(str) + 1);
             break;
         }
         hi = lo = list[i];
     }
 }
@@ -302,12 +318,14 @@ abbreviate_list_into_string(char *str, int max, int *list, int nlist)
     if (strlen(str)==0 || str[strlen(str)-1] != '.') {
         if (strlen(str)!=0) {
-            strcpy(&str[strlen(str)], ", ");
+            mystrncpy(&str[strlen(str)], ", ", max - strlen(str) + 1);
         }
         if (lo != hi) {
-            sprintf(&str[strlen(str)], "%d - %d", lo, hi);
+            snprintf(&str[strlen(str)], max - strlen(str) + 1,
+                     "%d - %d", lo, hi);
         } else {
-            sprintf(&str[strlen(str)], "%d", lo);
+            snprintf(&str[strlen(str)], max - strlen(str) + 1,
+                     "%d", lo);
         }
     }
 }
@@ -320,7 +338,7 @@
 static void
 ompi_report_comm_methods(int called_from_location) // 1 = from init, 2 = from finalize
 {
-    int numhosts, i, j, k;
+    int i, j, k;
     int max2Dprottable = 12;
     int max2D1Cprottable = 36;
     int hpmp_myrank;
@@ -328,8 +346,9 @@ ompi_report_comm_methods(int called_from_location) // 1 = from init, 2 = from fi
     int ret;
     ompi_communicator_t *local_comm, *leader_comm;
     int *method;
+    int *mymethod;
     char *hoststring;
-    char **allhoststrings;
+    char *affstring;
     int comm_mode; // MODE_IS_BTL / MTL / PML

     // early return in the case of spawn
@@ -364,6 +383,27 @@ ompi_report_comm_methods(int called_from_location) // 1 = from init, 2 = from fi
         return;
     }

+    char prefix_string[64];
+    char *unprefixed_affstring;
+    prefix_string[0] = 0;
+    if (mylocalrank == 0) {
+        myleaderrank = ompi_comm_rank(leader_comm);
+        snprintf(prefix_string, 64, "H%d: ", myleaderrank);
+    }
+    int indentation = strlen(prefix_string);
+    /* reject unreasonably small aff_columns settings */
+    if (mca_hook_comm_method_aff_columns < indentation + 16) {
+        mca_hook_comm_method_aff_columns = indentation + 16;
+    }
+    host_local_get_affstring(&unprefixed_affstring, indentation,
+        mylocalrank, nlocalranks, local_comm);
+    if (mylocalrank == 0) {
+        int affstring_sz = strlen(prefix_string) + strlen(unprefixed_affstring) + 16;
+        affstring = malloc(affstring_sz);
+        snprintf(affstring, affstring_sz, "%s%s", prefix_string, unprefixed_affstring);
+        free(unprefixed_affstring);
+    }
+
     // Non-host-leaders return early.
     if (mylocalrank != 0) {
         ompi_comm_free(&local_comm);
@@ -373,15 +413,18 @@ ompi_report_comm_methods(int called_from_location) // 1 = from init, 2 = from fi
     // Only host-leaders exist from this point on.
     // -------------------------------------------------
     myleaderrank = ompi_comm_rank(leader_comm);
-    nleaderranks = numhosts = ompi_comm_size(leader_comm);
+    nleaderranks = ompi_comm_size(leader_comm);

 /*
  * Allocate space for each rank to store its communication method
  * on a per-host basis. But rank 0 gets enough space to store the
  * data for all pairs of hosts.
 */
-    method = malloc(numhosts * sizeof(int) * (hpmp_myrank?1:numhosts));
-    if (!method) {
+    mymethod = method = malloc(nleaderranks * sizeof(int));
+    if (hpmp_myrank == 0) {
+        method = malloc(nleaderranks * sizeof(int) * nleaderranks);
+    }
+    if (!method || !mymethod) {
         ompi_comm_free(&local_comm);
         ompi_comm_free(&leader_comm);
         return;
     }
@@ -412,7 +455,7 @@ ompi_report_comm_methods(int called_from_location) // 1 = from init, 2 = from fi

     len = strlen(opal_process_info.nodename) + 100;
     hoststring = malloc(len + 1);
-    sprintf(hoststring, "Host %d [%s] ranks ",
+    snprintf(hoststring, len + 1, "Host %d [%s] ranks ",
         myleaderrank, opal_process_info.nodename);

     abbreviate_list_into_string(&hoststring[strlen(hoststring)],
@@ -470,10 +513,10 @@ ompi_report_comm_methods(int called_from_location) // 1 = from init, 2 = from fi
     MPI_Op_free(&myop);
     MPI_Type_free(&mydt);

-// Each host leader fills in a "numhosts" sized array method[] of
+// Each host leader fills in a "nleaderranks" sized array method[] of
 // how it communicates with each peer.
     for (i=0; i<nleaderranks; ++i) {
-        method[i] = comm_method(leader_comm, i);
+        mymethod[i] = comm_method(leader_comm, i);
         if (i == myleaderrank) {
             if (nlocalranks > 1) {
-                method[i] = comm_method(local_comm, 1);
+                mymethod[i] = comm_method(local_comm, 1);
             }
         }
     }
-// Gather the strings and the methods at rank 0.
-// The gatherv of the strings takes a few steps since we have to get
-// the sizes first and allocate the receiving string.
-    {
-        int len, *lens, *disps;
-
-        len = strlen(hoststring) + 1;
-        if (myleaderrank == 0) {
-            lens = malloc(nleaderranks * sizeof(int));
-            disps = malloc(nleaderranks * sizeof(int));
-        } else {
-            lens = disps = NULL;
-        }
-        leader_comm->c_coll->coll_gather(
-            &len, 1, MPI_INT,
-            lens, 1, MPI_INT,
-            0, leader_comm, leader_comm->c_coll->coll_gather_module);
-        if (myleaderrank == 0) {
-            int tlen = 0;
-            char *p;
-            for (i=0; i<nleaderranks; ++i) {
-                disps[i] = tlen;
-                tlen += lens[i];
-            }
-            allhoststrings = malloc(nleaderranks * sizeof(char*) + tlen);
-            p = (char*)allhoststrings + nleaderranks * sizeof(char*);
-            for (i=0; i<nleaderranks; ++i) {
-                allhoststrings[i] = p;
-                p += lens[i];
-            }
-            leader_comm->c_coll->coll_gatherv(
-                hoststring, strlen(hoststring) + 1, MPI_CHAR,
-                &allhoststrings[0][0], lens, disps, MPI_CHAR,
-                0, leader_comm, leader_comm->c_coll->coll_gatherv_module);
-        } else {
-            // matching above call from rank 0, just &allhoststrings[0][0]
-            // isn't legal here, and those args aren't used at non-root anyway
-            leader_comm->c_coll->coll_gatherv(
-                hoststring, strlen(hoststring) + 1, MPI_CHAR,
-                NULL, NULL, NULL, MPI_CHAR,
-                0, leader_comm, leader_comm->c_coll->coll_gatherv_module);
-        }
-        if (myleaderrank == 0) {
-            free(lens);
-            free(disps);
-        }
-// and a simpler gather for the methods
-        leader_comm->c_coll->coll_gather(
-            method, nleaderranks, MPI_INT,
-            method, nleaderranks, MPI_INT,
-            0, leader_comm, leader_comm->c_coll->coll_gather_module);
-    }
-    ompi_comm_free(&local_comm);
-    ompi_comm_free(&leader_comm);
+// Gather the methods at rank 0.
+    leader_comm->c_coll->coll_gather(
+        mymethod, nleaderranks, MPI_INT,
+        method, nleaderranks, MPI_INT,
+        0, leader_comm, leader_comm->c_coll->coll_gather_module);

 // Interception for testing purposes. Let rank-0 meddle with all its method[]
 // settings, this is only for testing, eg to make sure the printing comes out
@@ -567,12 +561,31 @@ ompi_report_comm_methods(int called_from_location) // 1 = from init, 2 = from fi
 // 1. the hoststring each host contributed
 // 2. the 2d table in method[] if it isn't too big
 // 3. summary of on/off host interconnect, and list the exceptions
-    if (myleaderrank == 0) {
+
 // 1: hoststring for each host
-        for (i=0; i
[...]
             if (method_count[k] > 0) {
 //                if (!first) {
 //                    strcat(str, " /");
 //                }
-                sprintf(&str[strlen(str)],
+                snprintf(&str[strlen(str)],
+                         1024 - strlen(str),
                          " [%dx %s]",
                          method_count[k],
                          comm_method_to_string(k));
@@ -874,8 +890,387 @@ ompi_report_comm_methods(int called_from_location) // 1 = from init, 2 = from fi
         }
     }
+    free(mymethod);
+    if (hpmp_myrank == 0) {
+        free(method);
+    }
+}
+
+/*
+ * Input is a multiline string containing \n but not ending with one.
+ * Ex:
+ *   "H1 aff: [......../......../......../........] Lrank 0-1\n
+ *            [......../......../......../........] Lrank 2-3"
+ */
+static void
+multiline_print(char *instr, int cols, int indentation) {
+    if (!instr) { return; }
+    if (instr[0] == 0) { return; }
+
+    if (cols < 0) { cols = INT_MAX; }
+
+    char *str = strdup(instr);
+    int len = strlen(str);
+    int i;
+    char *p1 = str;
+    char *p2;
+    char save;
+    int prev_line_completed = 1;
+    while (p1 < str + len) {
+        p2 = p1;
+        if (prev_line_completed) {
+            while (*p2 != 0 && *p2 != '\n' && p2-p1
[...]
+static void
+host_leader_printstring(char *hoststring,
+    int myleaderrank, int nleaderranks, ompi_communicator_t *leader_comm,
+    int cols, int indent)
+{
+    int i;
+    int len, *lens, *disps;
+    char **allhoststrings;
+
+    len = strlen(hoststring) + 1;
+    if (myleaderrank == 0) {
+        lens = malloc(nleaderranks * sizeof(int));
+        disps = malloc(nleaderranks * sizeof(int));
+    } else {
+        lens = disps = NULL;
+    }
+    leader_comm->c_coll->coll_gather(
+        &len, 1, MPI_INT,
+        lens, 1, MPI_INT,
+        0, leader_comm, leader_comm->c_coll->coll_gather_module);
+    if (myleaderrank == 0) {
+        int tlen = 0;
+        char *p;
+        for (i=0; i<nleaderranks; ++i) {
+            disps[i] = tlen;
+            tlen += lens[i];
+        }
+        allhoststrings = malloc(nleaderranks * sizeof(char*) + tlen);
+        p = (char*)allhoststrings + nleaderranks * sizeof(char*);
+        for (i=0; i<nleaderranks; ++i) {
+            allhoststrings[i] = p;
+            p += lens[i];
+        }
+        leader_comm->c_coll->coll_gatherv(
+            hoststring, strlen(hoststring) + 1, MPI_CHAR,
+            &allhoststrings[0][0], lens, disps, MPI_CHAR,
+            0, leader_comm, leader_comm->c_coll->coll_gatherv_module);
+    } else {
+        // matching above call from rank 0, just &allhoststrings[0][0]
+        // isn't legal here, and those args aren't used at non-root anyway
+        leader_comm->c_coll->coll_gatherv(
+            hoststring, strlen(hoststring) + 1, MPI_CHAR,
+            NULL, NULL, NULL, MPI_CHAR,
+            0, leader_comm, leader_comm->c_coll->coll_gatherv_module);
+    }
+    if (myleaderrank == 0) {
+        free(lens);
+        free(disps);
+    }
+    if (myleaderrank == 0) {
+        for (i=0; i<nleaderranks; ++i) {
+            multiline_print(allhoststrings[i], cols, indent);
+        }
+        free(allhoststrings);
+    }
+}
+
+#define LENGTHEN(affstring, nextchar, affstring_cursz, amount) \
+do { \
+    if ((nextchar) + (amount) >= (int)(*affstring_cursz)) { \
+        *affstring_cursz *= 1.25; \
+        *affstring_cursz += (amount + 16); \
+        *affstring = realloc(*affstring, *affstring_cursz); \
+    } \
+} while (0)
+
+/*
+ * If the old --report-bindings output would have been this:
+ *     [BB/../../..][../../../..]
+ *     [../BB/../..][../../../..]
+ *     [../../../..][BB/../../..]
+ *     [../../../..][../BB/../..]
+ * this function returns a string
+ *     [aa/bb/../..][cc/dd/../..]
+ *
+ * This function decides how many ranks' bitmasks to process based on
+ * 1. avoiding overlap, and
+ * 2. only using 52 letters a-zA-Z
+ * The *position argument should come in as 0 for the first call, and
+ * it gets updated to point to the next rank needing to be processed.
+ *
+ * Each call to this function adds another line to an existing *affstring
+ * (realloc()ing if needed). The second-and-later lines get a newline
+ * in front of them. The last line does not end with a newline.
+ */
+static void
+sprint_bitmaps(char **affstring, int *affstring_cursz,
+    int indentation,
+    hwloc_cpuset_t *cpus_forrank, int nranks,
+    int *position)
+{
+    int i, r;
+    int nchildren_done[32], depth, some_cores_printed_under_containing_obj = 0;
+    int nextchar;
+    hwloc_obj_t obj;
+    hwloc_bitmap_t cpus;
+    char *key = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+    int rankA, rankB;
+
+/* pre-computing how many ranks to use
+ */
+    cpus = hwloc_bitmap_alloc();
+    r = *position;
+    hwloc_bitmap_copy(cpus, cpus_forrank[r]);
+    while (r < nranks-1) {
+        if (!hwloc_bitmap_intersects(cpus, cpus_forrank[r+1]) &&
+            (r - *position + 1) < (int)strlen(key))
+        {
+            ++r;
+            hwloc_bitmap_or(cpus, cpus, cpus_forrank[r]);
+        } else {
+            break;
+        }
+    }
+    hwloc_bitmap_free(cpus);
+/* update position, save rankA and rankB to note the range we'll use */
+    rankA = *position;
+    rankB = r;
+    *position = *position + (rankB - rankA + 1);
+
+    nextchar = strlen(*affstring);
+    LENGTHEN(affstring, nextchar, affstring_cursz, indentation + 1);
+    if (rankA != 0) {
+        (*affstring)[nextchar++] = '\n';
+        (*affstring)[nextchar] = 0;
+        for (i=0; i<indentation; ++i) {
+            (*affstring)[nextchar++] = ' ';
+            (*affstring)[nextchar] = 0;
+        }
+    }
+
+    obj = hwloc_get_root_obj(opal_hwloc_topology);
+    depth = 0;
+    nchildren_done[0] = 0;
+    while (obj) {
+        LENGTHEN(affstring, nextchar, affstring_cursz, 2);
+        if (obj->type == HWLOC_OBJ_PACKAGE) {
+            (*affstring)[nextchar++] = '[';
+            (*affstring)[nextchar] = 0;
+            some_cores_printed_under_containing_obj = 0;
+        }
+        if (obj->type == HWLOC_OBJ_CORE && some_cores_printed_under_containing_obj) {
+            (*affstring)[nextchar++] = '/';
+            (*affstring)[nextchar] = 0;
+        }
+        /* if this is a numa level it doesn't have .type NUMA, it just
+         * has a .memory_arity on the side.
+         * The below code prints the "<...>" for both interesting
+         * and non-interesting numas. Eg if each package is a numa
+         * this might print "[<../..>]". We could change this to only
+         * print the interesting cases by adding .type==GROUP here,
+         * because the GROUP .type is used when the numa level isn't a 1:1
+         * match for some other obj in the tree.
+         *
+         * The iszero() check is for skipping numas that have no PUs under
+         * them. Without that check my PPC machines at least look like
+         *     "[<../..>][<../..>]<><><><><><>"
+         */
+#if HWLOC_API_VERSION >= 0x20000
+        if (obj->memory_arity > 0 && !hwloc_bitmap_iszero(obj->memory_first_child->cpuset)) {
+            (*affstring)[nextchar++] = '<';
+            (*affstring)[nextchar] = 0;
+            some_cores_printed_under_containing_obj = 0;
+        }
+#endif
+
+        if (obj->type == HWLOC_OBJ_PU) {
+            char c = '.';
+            for (r=rankA; r<=rankB; ++r) {
+                if (hwloc_bitmap_isset(cpus_forrank[r], obj->os_index)) {
+                    c = key[r - rankA];
+                    break;
+                }
+            }
+            (*affstring)[nextchar++] = c;
+            (*affstring)[nextchar] = 0;
+            some_cores_printed_under_containing_obj = 1;
+        }
+
+/*
+ * traverse up till we find an obj we haven't yet processed all the children for
+ */
+        i = nchildren_done[depth];
+        while (depth >= 0 && i >= (int)obj->arity) {
+            LENGTHEN(affstring, nextchar, affstring_cursz, 2);
+#if HWLOC_API_VERSION >= 0x20000
+            if (obj->memory_arity > 0 && !hwloc_bitmap_iszero(obj->memory_first_child->cpuset)) {
+                (*affstring)[nextchar++] = '>';
+                (*affstring)[nextchar] = 0;
+                some_cores_printed_under_containing_obj = 0;
+            }
+#endif
+            if (obj->type == HWLOC_OBJ_PACKAGE) {
+                (*affstring)[nextchar++] = ']';
+                (*affstring)[nextchar] = 0;
+                some_cores_printed_under_containing_obj = 0;
+            }
+
+            obj = obj->parent;
+            --depth;
+            if (depth >= 0) { i = nchildren_done[depth]; }
+        }
+/*
+ * we're either ready to go to the next child of this obj, or we've walked off
+ * the top of the tree and are done
+ */
+        if (obj && i < (int)obj->arity) {
+            obj = obj->children[i];
+            ++nchildren_done[depth];
+            ++depth;
+            nchildren_done[depth] = 0;
+        } else {
+            obj = NULL;
+        }
+    }
+}
+static void
+host_local_get_affstring(
+    char **affstring, int indentation,
+    int mylocalrank, int nlocalranks,
+    ompi_communicator_t *local_comm)
+{
+    hwloc_cpuset_t mycpus;
+    hwloc_cpuset_t *cpus_forrank;
+    char *mybitmap_string;
+    char **bitmap_strings;
+
+    /* ensure we have the topology */
+    if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
+        return;
+    }
+
+    mycpus = hwloc_bitmap_alloc();
+    hwloc_get_cpubind(opal_hwloc_topology, mycpus, HWLOC_CPUBIND_PROCESS);
+    hwloc_bitmap_list_asprintf(&mybitmap_string, mycpus);
+    hwloc_bitmap_free(mycpus);
+
+// Gather hwloc_bitmap_t strings at host-local rank 0.
+    int len, *lens, *disps, i;
+
+    len = strlen(mybitmap_string) + 1;
+    if (mylocalrank == 0) {
+        lens = malloc(nlocalranks * sizeof(int));
+        disps = malloc(nlocalranks * sizeof(int));
+    } else {
+        lens = disps = NULL;
+    }
+    local_comm->c_coll->coll_gather(
+        &len, 1, MPI_INT,
+        lens, 1, MPI_INT,
+        0, local_comm, local_comm->c_coll->coll_gather_module);
+    if (mylocalrank == 0) {
+        int tlen = 0;
+        char *p;
+        for (i=0; i<nlocalranks; ++i) {
+            disps[i] = tlen;
+            tlen += lens[i];
+        }
+        bitmap_strings = malloc(nlocalranks * sizeof(char*) + tlen);
+        p = (char*)bitmap_strings + nlocalranks * sizeof(char*);
+        for (i=0; i<nlocalranks; ++i) {
+            bitmap_strings[i] = p;
+            p += lens[i];
+        }
+        local_comm->c_coll->coll_gatherv(
+            mybitmap_string, len, MPI_CHAR,
+            &bitmap_strings[0][0], lens, disps, MPI_CHAR,
+            0, local_comm, local_comm->c_coll->coll_gatherv_module);
+    } else {
+        // matching above call from rank 0, just &bitmap_strings[0][0]
+        // isn't legal here, and those args aren't used at non-root anyway
+        local_comm->c_coll->coll_gatherv(
+            mybitmap_string, len, MPI_CHAR,
+            NULL, NULL, NULL, MPI_CHAR,
+            0, local_comm, local_comm->c_coll->coll_gatherv_module);
+    }
+
+    free(mybitmap_string);
+    if (mylocalrank != 0) { return; }
+
+/* only the host leader runs the rest */
+    free(lens);
+    free(disps);
+
+    cpus_forrank = malloc(nlocalranks * sizeof(hwloc_cpuset_t));
+    for (i=0; i
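For reference, the range-compression technique that abbreviate_list_into_string() applies to the hoststrings above ("Host 0 [nodename] ranks 0 - 3, 8, ..") can be illustrated with a small self-contained program. This is a sketch of the same idea under the same snprintf() sizing convention (a buffer of max+1 bytes holding at most max printable chars), not the Open MPI routine itself:

    #include <stdio.h>
    #include <string.h>

    /* Compress a sorted list of ranks into "lo - hi" ranges, appending
     * ", .." and stopping once the output nears 'max' printable chars. */
    static void
    abbreviate(char *str, int max, const int *list, int nlist)
    {
        int lo = list[0], hi = list[0];
        str[0] = 0;
        for (int i = 1; i <= nlist; ++i) {
            /* extend the current contiguous chunk as long as possible */
            if (i < nlist && list[i] == hi + 1) { hi = list[i]; continue; }
            if (strlen(str) + 16 > (size_t)max) {  /* crude overflow guard */
                snprintf(&str[strlen(str)], max - strlen(str) + 1, ", ..");
                return;
            }
            if (strlen(str) != 0) {
                snprintf(&str[strlen(str)], max - strlen(str) + 1, ", ");
            }
            if (lo != hi) {
                snprintf(&str[strlen(str)], max - strlen(str) + 1, "%d - %d", lo, hi);
            } else {
                snprintf(&str[strlen(str)], max - strlen(str) + 1, "%d", lo);
            }
            if (i < nlist) { lo = hi = list[i]; }  /* start the next chunk */
        }
    }

    int main(void)
    {
        int ranks[] = { 0, 1, 2, 3, 8, 10, 11, 12 };
        char buf[64];
        abbreviate(buf, 63, ranks, 8);
        printf("%s\n", buf);  /* prints: 0 - 3, 8, 10 - 12 */
        return 0;
    }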