Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 76 additions & 12 deletions ompi/mca/coll/libnbc/nbc_ialltoall.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
*
* Author(s): Torsten Hoefler <htor@cs.indiana.edu>
Expand All @@ -25,6 +25,8 @@ static inline int a2a_sched_pairwise(int rank, int p, MPI_Aint sndext, MPI_Aint
static inline int a2a_sched_diss(int rank, int p, MPI_Aint sndext, MPI_Aint rcvext, NBC_Schedule* schedule,
const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf,
int recvcount, MPI_Datatype recvtype, MPI_Comm comm, NBC_Handle *handle);
static inline int a2a_sched_inplace(int rank, int p, NBC_Schedule* schedule, void* buf, int count,
MPI_Datatype type, MPI_Aint ext, ptrdiff_t gap, MPI_Comm comm);

#ifdef NBC_CACHE_SCHEDULE
/* tree comparison function for schedule cache */
Expand Down Expand Up @@ -59,9 +61,10 @@ int ompi_coll_libnbc_ialltoall(const void* sendbuf, int sendcount, MPI_Datatype
NBC_Alltoall_args *args, *found, search;
#endif
char *rbuf, *sbuf, inplace;
enum {NBC_A2A_LINEAR, NBC_A2A_PAIRWISE, NBC_A2A_DISS} alg;
enum {NBC_A2A_LINEAR, NBC_A2A_PAIRWISE, NBC_A2A_DISS, NBC_A2A_INPLACE} alg;
NBC_Handle *handle;
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
ptrdiff_t span, gap;

NBC_IN_PLACE(sendbuf, recvbuf, inplace);

Expand Down Expand Up @@ -89,7 +92,9 @@ int ompi_coll_libnbc_ialltoall(const void* sendbuf, int sendcount, MPI_Datatype
/* algorithm selection */
a2asize = sndsize * sendcount * p;
/* this number is optimized for TCP on odin.cs.indiana.edu */
if((p <= 8) && ((a2asize < 1<<17) || (sndsize*sendcount < 1<<12))) {
if (inplace) {
alg = NBC_A2A_INPLACE;
} else if((p <= 8) && ((a2asize < 1<<17) || (sndsize*sendcount < 1<<12))) {
/* just send as fast as we can if we have at most 8 peers and either the
* total communicated size is smaller than 1<<17 or the per-peer message
* size is smaller than 1<<12 (presumably small enough to be sent eagerly) */
Expand All @@ -116,7 +121,14 @@ int ompi_coll_libnbc_ialltoall(const void* sendbuf, int sendcount, MPI_Datatype
}

/* allocate temp buffer if we need one */
if (alg == NBC_A2A_DISS) {
if (alg == NBC_A2A_INPLACE) {
span = opal_datatype_span(&recvtype->super, recvcount, &gap);
handle->tmpbuf = malloc(span);
if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) {
NBC_Return_handle (handle);
return OMPI_ERR_OUT_OF_RESOURCE;
}
} else if (alg == NBC_A2A_DISS) {
/* only A2A_DISS needs buffers */
if(NBC_Type_intrinsic(sendtype)) {
datasize = sndext * sendcount;
Expand Down Expand Up @@ -200,6 +212,9 @@ int ompi_coll_libnbc_ialltoall(const void* sendbuf, int sendcount, MPI_Datatype
handle->schedule = schedule;

switch(alg) {
case NBC_A2A_INPLACE:
res = a2a_sched_inplace(rank, p, schedule, recvbuf, recvcount, recvtype, rcvext, gap, comm);
break;
case NBC_A2A_LINEAR:
res = a2a_sched_linear(rank, p, sndext, rcvext, schedule, sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm);
break;
Expand Down Expand Up @@ -359,17 +374,10 @@ static inline int a2a_sched_pairwise(int rank, int p, MPI_Aint sndext, MPI_Aint
}

char *sbuf = (char *) sendbuf + sndpeer * sendcount * sndext;
res = NBC_Sched_send (sbuf, false, sendcount, sendtype, sndpeer, schedule, false);
res = NBC_Sched_send (sbuf, false, sendcount, sendtype, sndpeer, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res;
}

if (r < p) {
res = NBC_Sched_barrier (schedule);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res;
}
}
}

return OMPI_SUCCESS;
Expand Down Expand Up @@ -497,3 +505,59 @@ static inline int a2a_sched_diss(int rank, int p, MPI_Aint sndext, MPI_Aint rcve
return OMPI_SUCCESS;
}

/*
 * Build the schedule for an alltoall that exchanges data in place, i.e. the
 * per-peer blocks are both sent from and received into the same buffer.
 *
 * The buffer is treated as p consecutive blocks of `count` elements of
 * `type`; the block for peer k starts at byte offset k * count * ext.
 * For every distance i in [1, (p+1)/2) the rank exchanges blocks with the
 * peers at +i and -i; when p is even, the remaining peer at distance p/2 is
 * handled in a final self-contained exchange.
 *
 * rank     - rank of the calling process
 * p        - number of processes taking part
 * schedule - schedule the operations are appended to
 * buf      - buffer holding the p blocks (sent from and received into)
 * count    - number of elements per block
 * type     - datatype of the elements
 * ext      - extent of `type` in bytes, used to compute block offsets
 * gap      - gap of the temporary buffer; (void *)(-gap) is passed together
 *            with tmpbuf == true as the temporary-buffer reference
 *            (NOTE(review): presumably an offset relative to the handle's
 *            temporary buffer - confirm against the NBC_Sched_* helpers)
 * comm     - not used by this function
 *
 * Returns OMPI_SUCCESS, or the first error code returned by an NBC_Sched_*
 * call.
 *
 * NOTE(review): the trailing boolean of the NBC_Sched_* calls presumably
 * requests a barrier after the operation; the operations below are grouped
 * into rounds on that assumption.
 */
static inline int a2a_sched_inplace(int rank, int p, NBC_Schedule* schedule, void* buf, int count,
                                   MPI_Datatype type, MPI_Aint ext, ptrdiff_t gap, MPI_Comm comm) {
  /* Size of one per-peer block in bytes.  The multiplication is carried out
   * in MPI_Aint: evaluating "peer * count" in int first (as "peer * count *
   * ext" does) can overflow int for large counts / communicators. */
  const MPI_Aint blocksize = (MPI_Aint) count * ext;
  int res;

  for (int i = 1 ; i < (p+1)/2 ; i++) {
    int speer = (rank + i) % p;
    int rpeer = (rank + p - i) % p;
    char *sbuf = (char *) buf + (MPI_Aint) speer * blocksize;
    char *rbuf = (char *) buf + (MPI_Aint) rpeer * blocksize;

    /* save the block destined for rpeer, since it is overwritten by the
     * receive from rpeer below */
    res = NBC_Sched_copy (rbuf, false, count, type,
                          (void *)(-gap), true, count, type,
                          schedule, true);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }

    /* send speer its block and receive rpeer's block into rbuf */
    res = NBC_Sched_send (sbuf, false , count, type, speer, schedule, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
    res = NBC_Sched_recv (rbuf, false , count, type, rpeer, schedule, true);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }

    /* send the saved block to rpeer and receive speer's block into sbuf,
     * which has been sent already */
    res = NBC_Sched_send ((void *)(-gap), true, count, type, rpeer, schedule, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
    res = NBC_Sched_recv (sbuf, false, count, type, speer, schedule, true);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
  }

  if (0 == (p%2)) {
    /* even number of processes: the peer at distance p/2 is both the send
     * and the receive partner, so exchange through the temporary copy */
    int peer = (rank + p/2) % p;
    char *tbuf = (char *) buf + (MPI_Aint) peer * blocksize;

    res = NBC_Sched_copy (tbuf, false, count, type,
                          (void *)(-gap), true, count, type,
                          schedule, true);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
    res = NBC_Sched_send ((void *)(-gap), true , count, type, peer, schedule, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
    res = NBC_Sched_recv (tbuf, false , count, type, peer, schedule, true);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
  }

  return OMPI_SUCCESS;
}
Loading