Skip to content

Commit 76204df

Browse files
committed
coll/basic: fix segmentation fault in neighborhood collectives if the degree
of the topology is higher than the communicator size It is possible to have a topology degree higher than the size of the communicator. For example, a periodic cartesian communicator on MPI_COMM_SELF. This will leave the neighborhood collectives with a request buffer that is too small. This commits introduces a semantic change : from now, c_topo must be set before invoking coll_select
1 parent 2f67f29 commit 76204df

File tree

4 files changed

+107
-24
lines changed

4 files changed

+107
-24
lines changed

ompi/mca/coll/basic/coll_basic_module.c

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
1414
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
1515
* reserved.
16+
* Copyright (c) 2014 Research Organization for Information Science
17+
* and Technology (RIST). All rights reserved.
1618
* $COPYRIGHT$
1719
*
1820
* Additional copyrights may follow
@@ -28,6 +30,8 @@
2830
#include "mpi.h"
2931
#include "ompi/mca/coll/coll.h"
3032
#include "ompi/mca/coll/base/base.h"
33+
#include "ompi/mca/topo/topo.h"
34+
#include "ompi/mca/topo/base/base.h"
3135
#include "coll_basic.h"
3236

3337

@@ -70,7 +74,36 @@ mca_coll_basic_comm_query(struct ompi_communicator_t *comm,
7074
} else {
7175
size = ompi_comm_size(comm);
7276
}
73-
basic_module->mccb_num_reqs = size * 2;
77+
size *= 2;
78+
if (OMPI_COMM_IS_CART(comm)) {
79+
int cart_size;
80+
mca_topo_base_comm_cart_2_2_0_t *cart;
81+
assert (NULL != comm->c_topo);
82+
cart = comm->c_topo->mtc.cart;
83+
cart_size = cart->ndims * 4;
84+
if (cart_size > size) {
85+
size = cart_size;
86+
}
87+
} else if (OMPI_COMM_IS_GRAPH(comm)) {
88+
int rank, degree;
89+
assert (NULL != comm->c_topo);
90+
rank = ompi_comm_rank (comm);
91+
mca_topo_base_graph_neighbors_count (comm, rank, &degree);
92+
degree *= 2;
93+
if (degree > size) {
94+
size = degree;
95+
}
96+
} else if (OMPI_COMM_IS_DIST_GRAPH(comm)) {
97+
int dist_graph_size;
98+
mca_topo_base_comm_dist_graph_2_2_0_t *dist_graph;
99+
assert (NULL != comm->c_topo);
100+
dist_graph = comm->c_topo->mtc.dist_graph;
101+
dist_graph_size = dist_graph->indegree + dist_graph->outdegree;
102+
if (dist_graph_size > size) {
103+
size = dist_graph_size;
104+
}
105+
}
106+
basic_module->mccb_num_reqs = size;
74107
basic_module->mccb_reqs = (ompi_request_t**)
75108
malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs);
76109

ompi/mca/topo/base/topo_base_cart_create.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,20 +162,24 @@ int mca_topo_base_cart_create(mca_topo_base_module_t *topo,
162162
return MPI_ERR_INTERN;
163163
}
164164

165+
assert(NULL == new_comm->c_topo);
166+
assert(!(new_comm->c_flags & OMPI_COMM_CART));
167+
new_comm->c_topo = topo;
168+
new_comm->c_topo->mtc.cart = cart;
169+
new_comm->c_topo->reorder = reorder;
170+
new_comm->c_flags |= OMPI_COMM_CART;
165171
ret = ompi_comm_enable(old_comm, new_comm,
166172
new_rank, num_procs, topo_procs);
167173
if (OMPI_SUCCESS != ret) {
168174
/* something wrong happened during setting the communicator */
175+
new_comm->c_topo = NULL;
176+
new_comm->c_flags &= ~OMPI_COMM_CART;
169177
free(topo_procs);
170178
OBJ_RELEASE(cart);
171179
ompi_comm_free (&new_comm);
172180
return ret;
173181
}
174182

175-
new_comm->c_topo = topo;
176-
new_comm->c_topo->mtc.cart = cart;
177-
new_comm->c_topo->reorder = reorder;
178-
new_comm->c_flags |= OMPI_COMM_CART;
179183
*comm_topo = new_comm;
180184

181185
if( MPI_UNDEFINED == new_rank ) {

ompi/mca/topo/base/topo_base_dist_graph_create.c

Lines changed: 57 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -288,28 +288,71 @@ int mca_topo_base_dist_graph_create(mca_topo_base_module_t* module,
288288
{
289289
int err;
290290

291-
if( OMPI_SUCCESS != (err = ompi_comm_create(comm_old,
292-
comm_old->c_local_group,
293-
newcomm)) ) {
294-
OBJ_RELEASE(module);
295-
return err;
291+
ompi_proc_t **topo_procs = NULL;
292+
int num_procs, ret, rank, i;
293+
ompi_communicator_t *new_comm;
294+
mca_topo_base_comm_dist_graph_2_2_0_t* topo;
295+
num_procs = ompi_comm_size(comm_old);
296+
rank = ompi_comm_rank(comm_old);
297+
topo_procs = (ompi_proc_t**)malloc(num_procs * sizeof(ompi_proc_t *));
298+
if(OMPI_GROUP_IS_DENSE(comm_old->c_local_group)) {
299+
memcpy(topo_procs,
300+
comm_old->c_local_group->grp_proc_pointers,
301+
num_procs * sizeof(ompi_proc_t *));
302+
} else {
303+
for(i = 0 ; i < num_procs; i++) {
304+
topo_procs[i] = ompi_group_peer_lookup(comm_old->c_local_group,i);
305+
}
306+
}
307+
new_comm = ompi_comm_allocate(num_procs, 0);
308+
if (NULL == new_comm) {
309+
free(topo_procs);
310+
return OMPI_ERR_OUT_OF_RESOURCE;
296311
}
297-
298-
assert(NULL == (*newcomm)->c_topo);
299-
(*newcomm)->c_topo = module;
300-
(*newcomm)->c_topo->reorder = reorder;
301-
(*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH;
302-
303312
err = mca_topo_base_dist_graph_distribute(module,
304-
*newcomm,
313+
comm_old,
305314
n, nodes,
306315
degrees, targets,
307316
weights,
308-
&((*newcomm)->c_topo->mtc.dist_graph));
317+
&topo);
309318
if( OMPI_SUCCESS != err ) {
319+
free(topo_procs);
310320
ompi_comm_free(newcomm);
321+
return err;
311322
}
312-
return err;
323+
324+
assert(NULL == new_comm->c_topo);
325+
new_comm->c_topo = module;
326+
new_comm->c_topo->reorder = reorder;
327+
new_comm->c_flags |= OMPI_COMM_DIST_GRAPH;
328+
new_comm->c_topo->mtc.dist_graph = topo;
329+
330+
ret = ompi_comm_enable(comm_old, new_comm,
331+
rank, num_procs, topo_procs);
332+
if (OMPI_SUCCESS != ret) {
333+
if ( NULL != topo->in ) {
334+
free(topo->in);
335+
}
336+
if ( NULL != topo->out ) {
337+
free(topo->out);
338+
}
339+
if ( NULL != topo->inw ) {
340+
free(topo->inw);
341+
}
342+
if ( NULL != topo->outw ) {
343+
free(topo->outw);
344+
}
345+
free(topo);
346+
free(topo_procs);
347+
new_comm->c_topo = NULL;
348+
new_comm->c_flags &= ~OMPI_COMM_DIST_GRAPH;
349+
new_comm->c_topo->mtc.dist_graph = NULL;
350+
ompi_comm_free (&new_comm);
351+
return ret;
352+
}
353+
*newcomm = new_comm;
354+
355+
return OMPI_SUCCESS;
313356
}
314357

315358
static void mca_topo_base_comm_dist_graph_2_2_0_construct(mca_topo_base_comm_dist_graph_2_2_0_t * dist_graph) {

ompi/mca/topo/base/topo_base_graph_create.c

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2012-2013 Inria. All rights reserved.
13-
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2014 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
1616
* $COPYRIGHT$
@@ -123,19 +123,22 @@ int mca_topo_base_graph_create(mca_topo_base_module_t *topo,
123123
return OMPI_ERR_OUT_OF_RESOURCE;
124124
}
125125

126+
new_comm->c_topo = topo;
127+
new_comm->c_topo->mtc.graph = graph;
128+
new_comm->c_flags |= OMPI_COMM_GRAPH;
129+
new_comm->c_topo->reorder = reorder;
130+
126131
ret = ompi_comm_enable(old_comm, new_comm,
127132
new_rank, num_procs, topo_procs);
128133
if (OMPI_SUCCESS != ret) {
134+
new_comm->c_topo = NULL;
135+
new_comm->c_flags &= ~OMPI_COMM_GRAPH;
129136
free(topo_procs);
130137
OBJ_RELEASE(graph);
131138
ompi_comm_free (&new_comm);
132139
return ret;
133140
}
134141

135-
new_comm->c_topo = topo;
136-
new_comm->c_topo->mtc.graph = graph;
137-
new_comm->c_flags |= OMPI_COMM_GRAPH;
138-
new_comm->c_topo->reorder = reorder;
139142
*comm_topo = new_comm;
140143

141144
if( MPI_UNDEFINED == new_rank ) {

0 commit comments

Comments
 (0)