From 150eafd96c317d098c530212660ffb057f719848 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 5 Oct 2016 11:56:53 +0900 Subject: [PATCH 1/2] coll/libnbc: implement support for MPI_IN_PLACE in MPI_Ialltoall* Thanks Chris Ward for the report Many thanks to George for the guidance (cherry picked from commit open-mpi/ompi@1e0f591811f422175200b7ae2ba76c825d713489) --- ompi/mca/coll/libnbc/nbc_ialltoall.c | 88 +++++++++-- ompi/mca/coll/libnbc/nbc_ialltoallv.c | 218 ++++++++++++++++++++++---- ompi/mca/coll/libnbc/nbc_ialltoallw.c | 217 +++++++++++++++++++++---- 3 files changed, 451 insertions(+), 72 deletions(-) diff --git a/ompi/mca/coll/libnbc/nbc_ialltoall.c b/ompi/mca/coll/libnbc/nbc_ialltoall.c index d318e6e09c1..c9df894cc01 100644 --- a/ompi/mca/coll/libnbc/nbc_ialltoall.c +++ b/ompi/mca/coll/libnbc/nbc_ialltoall.c @@ -8,7 +8,7 @@ * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * * Author(s): Torsten Hoefler @@ -25,6 +25,8 @@ static inline int a2a_sched_pairwise(int rank, int p, MPI_Aint sndext, MPI_Aint static inline int a2a_sched_diss(int rank, int p, MPI_Aint sndext, MPI_Aint rcvext, NBC_Schedule* schedule, const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, NBC_Handle *handle); +static inline int a2a_sched_inplace(int rank, int p, NBC_Schedule* schedule, void* buf, int count, + MPI_Datatype type, MPI_Aint ext, ptrdiff_t gap, MPI_Comm comm); #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ @@ -59,9 +61,10 @@ int ompi_coll_libnbc_ialltoall(const void* sendbuf, int sendcount, MPI_Datatype NBC_Alltoall_args *args, *found, search; #endif char *rbuf, *sbuf, inplace; - enum {NBC_A2A_LINEAR, NBC_A2A_PAIRWISE, NBC_A2A_DISS} alg; + enum {NBC_A2A_LINEAR, NBC_A2A_PAIRWISE, NBC_A2A_DISS, NBC_A2A_INPLACE} alg; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; NBC_IN_PLACE(sendbuf, recvbuf, inplace); @@ -89,7 +92,9 @@ int ompi_coll_libnbc_ialltoall(const void* sendbuf, int sendcount, MPI_Datatype /* algorithm selection */ a2asize = sndsize * sendcount * p; /* this number is optimized for TCP on odin.cs.indiana.edu */ - if((p <= 8) && ((a2asize < 1<<17) || (sndsize*sendcount < 1<<12))) { + if (inplace) { + alg = NBC_A2A_INPLACE; + } else if((p <= 8) && ((a2asize < 1<<17) || (sndsize*sendcount < 1<<12))) { /* just send as fast as we can if we have less than 8 peers, if the * total communicated size is smaller than 1<<17 *and* if we don't * have eager messages (msgsize < 1<<13) */ @@ -116,7 +121,14 @@ int ompi_coll_libnbc_ialltoall(const void* sendbuf, int sendcount, MPI_Datatype } /* allocate temp buffer if we need one */ - if (alg == NBC_A2A_DISS) { + if (alg == NBC_A2A_INPLACE) { + span = opal_datatype_span(&recvtype->super, recvcount, &gap); + handle->tmpbuf = malloc(span); + if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { + NBC_Return_handle (handle); + return OMPI_ERR_OUT_OF_RESOURCE; + } + } else if (alg == NBC_A2A_DISS) { /* only A2A_DISS needs buffers */ if(NBC_Type_intrinsic(sendtype)) { datasize = sndext * sendcount; @@ -200,6 +212,9 @@ int ompi_coll_libnbc_ialltoall(const void* sendbuf, int sendcount, MPI_Datatype handle->schedule = schedule; switch(alg) { + case NBC_A2A_INPLACE: + res = a2a_sched_inplace(rank, p, schedule, recvbuf, recvcount, recvtype, rcvext, gap, comm); + break; case NBC_A2A_LINEAR: res = a2a_sched_linear(rank, p, sndext, rcvext, schedule, sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm); break; @@ -359,17 +374,10 @@ static inline int a2a_sched_pairwise(int rank, int p, MPI_Aint sndext, MPI_Aint } char *sbuf = (char *) sendbuf + sndpeer * sendcount * sndext; - res = NBC_Sched_send (sbuf, false, sendcount, sendtype, sndpeer, schedule, false); + res = NBC_Sched_send (sbuf, false, sendcount, sendtype, sndpeer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } - - if (r < p) { - res = NBC_Sched_barrier (schedule); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - return res; - } - } } return OMPI_SUCCESS; @@ -497,3 +505,59 @@ static inline int a2a_sched_diss(int rank, int p, MPI_Aint sndext, MPI_Aint rcve return OMPI_SUCCESS; } +static inline int a2a_sched_inplace(int rank, int p, NBC_Schedule* schedule, void* buf, int count, + MPI_Datatype type, MPI_Aint ext, ptrdiff_t gap, MPI_Comm comm) { + int res; + + for (int i = 1 ; i < (p+1)/2 ; i++) { + int speer = (rank + i) % p; + int rpeer = (rank + p - i) % p; + char *sbuf = (char *) buf + speer * count * ext; + char *rbuf = (char *) buf + rpeer * count * ext; + + res = NBC_Sched_copy (rbuf, false, count, type, + (void *)(-gap), true, count, type, + schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + res = NBC_Sched_send (sbuf, false , count, type, speer, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + res = NBC_Sched_recv (rbuf, false , count, type, rpeer, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + + res = NBC_Sched_send ((void *)(-gap), true, count, type, rpeer, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + res = NBC_Sched_recv (sbuf, false, count, type, speer, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + if (0 == (p%2)) { + int peer = (rank + p/2) % p; + + char *tbuf = (char *) buf + peer * count * ext; + res = NBC_Sched_copy (tbuf, false, count, type, + (void *)(-gap), true, count, type, + schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + res = NBC_Sched_send ((void *)(-gap), true , count, type, peer, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + res = NBC_Sched_recv (tbuf, false , count, type, peer, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/libnbc/nbc_ialltoallv.c b/ompi/mca/coll/libnbc/nbc_ialltoallv.c index 946b627ca77..77761623fdd 100644 --- a/ompi/mca/coll/libnbc/nbc_ialltoallv.c +++ b/ompi/mca/coll/libnbc/nbc_ialltoallv.c @@ -5,7 +5,7 @@ * Corporation. All rights reserved. * Copyright (c) 2006 The Technical University of Chemnitz. All * rights reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. @@ -15,6 +15,22 @@ */ #include "nbc_internal.h" +static inline int a2av_sched_linear(int rank, int p, NBC_Schedule *schedule, + const void *sendbuf, const int *sendcounts, + const int *sdispls, MPI_Aint sndext, MPI_Datatype sendtype, + void *recvbuf, const int *recvcounts, + const int *rdispls, MPI_Aint rcvext, MPI_Datatype recvtype); + +static inline int a2av_sched_pairwise(int rank, int p, NBC_Schedule *schedule, + const void *sendbuf, const int *sendcounts, const int *sdispls, + MPI_Aint sndext, MPI_Datatype sendtype, + void *recvbuf, const int *recvcounts, const int *rdispls, + MPI_Aint rcvext, MPI_Datatype recvtype); + +static inline int a2av_sched_inplace(int rank, int p, NBC_Schedule *schedule, + void *buf, const int *counts, const int *displs, + MPI_Aint ext, MPI_Datatype type, ptrdiff_t gap); + /* an alltoallv schedule can not be cached easily because the contents * ot the recvcounts array may change, so a comparison of the address * would not be sufficient ... we simply do not cache it */ @@ -29,6 +45,7 @@ int ompi_coll_libnbc_ialltoallv(const void* sendbuf, const int *sendcounts, cons MPI_Aint sndext, rcvext; NBC_Schedule *schedule; char *rbuf, *sbuf, inplace; + ptrdiff_t gap, span; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; @@ -49,8 +66,28 @@ int ompi_coll_libnbc_ialltoallv(const void* sendbuf, const int *sendcounts, cons return res; } + res = NBC_Init_handle (comm, &handle, libnbc_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + /* copy data to receivbuffer */ - if ((sendcounts[rank] != 0) && !inplace) { + if (inplace) { + int count = 0; + for (int i = 0; i < p; i++) { + if (recvcounts[i] > count) { + count = recvcounts[i]; + } + } + span = opal_datatype_span(&recvtype->super, count, &gap); + handle->tmpbuf = malloc(span); + if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { + NBC_Return_handle (handle); + return OMPI_ERR_OUT_OF_RESOURCE; + } + sendcounts = recvcounts; + sdispls = rdispls; + } else if (sendcounts[rank] != 0) { rbuf = (char *) recvbuf + rdispls[rank] * rcvext; sbuf = (char *) sendbuf + sdispls[rank] * sndext; res = NBC_Copy (sbuf, sendcounts[rank], sendtype, rbuf, recvcounts[rank], recvtype, comm); @@ -61,43 +98,28 @@ int ompi_coll_libnbc_ialltoallv(const void* sendbuf, const int *sendcounts, cons schedule = OBJ_NEW(NBC_Schedule); if (OPAL_UNLIKELY(NULL == schedule)) { + NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; } - for (int i = 0 ; i < p ; ++i) { - if (i == rank) { - continue; - } - - /* post all sends */ - if (sendcounts[i] != 0) { - sbuf = ((char *) sendbuf) + (sdispls[i] * sndext); - res = NBC_Sched_send(sbuf, false, sendcounts[i], sendtype, i, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - return res; - } - } - /* post all receives */ - if (recvcounts[i] != 0) { - rbuf = ((char *) recvbuf) + (rdispls[i] * rcvext); - res = NBC_Sched_recv(rbuf, false, recvcounts[i], recvtype, i, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - return res; - } - } + if (inplace) { + res = a2av_sched_inplace(rank, p, schedule, recvbuf, recvcounts, + rdispls, rcvext, recvtype, gap); + } else { + res = a2av_sched_linear(rank, p, schedule, + sendbuf, sendcounts, sdispls, sndext, sendtype, + recvbuf, recvcounts, rdispls, rcvext, recvtype); } - - res = NBC_Sched_commit (schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + NBC_Return_handle (handle); OBJ_RELEASE(schedule); return res; } - res = NBC_Init_handle (comm, &handle, libnbc_module); + res = NBC_Sched_commit (schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + NBC_Return_handle (handle); OBJ_RELEASE(schedule); return res; } @@ -123,7 +145,6 @@ int ompi_coll_libnbc_ialltoallv_inter (const void* sendbuf, const int *sendcount int res, rsize; MPI_Aint sndext, rcvext; NBC_Schedule *schedule; - char *rbuf, *sbuf; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; @@ -150,7 +171,7 @@ int ompi_coll_libnbc_ialltoallv_inter (const void* sendbuf, const int *sendcount for (int i = 0; i < rsize; i++) { /* post all sends */ if (sendcounts[i] != 0) { - sbuf = (char *) sendbuf + sdispls[i] * sndext; + char *sbuf = (char *) sendbuf + sdispls[i] * sndext; res = NBC_Sched_send (sbuf, false, sendcounts[i], sendtype, i, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); @@ -159,7 +180,7 @@ int ompi_coll_libnbc_ialltoallv_inter (const void* sendbuf, const int *sendcount } /* post all receives */ if (recvcounts[i] != 0) { - rbuf = (char *) recvbuf + rdispls[i] * rcvext; + char *rbuf = (char *) recvbuf + rdispls[i] * rcvext; res = NBC_Sched_recv (rbuf, false, recvcounts[i], recvtype, i, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); @@ -191,3 +212,138 @@ int ompi_coll_libnbc_ialltoallv_inter (const void* sendbuf, const int *sendcount return OMPI_SUCCESS; } + +static inline int a2av_sched_linear(int rank, int p, NBC_Schedule *schedule, + const void *sendbuf, const int *sendcounts, const int *sdispls, + MPI_Aint sndext, MPI_Datatype sendtype, + void *recvbuf, const int *recvcounts, const int *rdispls, + MPI_Aint rcvext, MPI_Datatype recvtype) { + int res; + + for (int i = 0 ; i < p ; ++i) { + if (i == rank) { + continue; + } + + /* post send */ + if (sendcounts[i] != 0) { + char *sbuf = ((char *) sendbuf) + (sdispls[i] * sndext); + res = NBC_Sched_send(sbuf, false, sendcounts[i], sendtype, i, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + + /* post receive */ + if (recvcounts[i] != 0) { + char *rbuf = ((char *) recvbuf) + (rdispls[i] * rcvext); + res = NBC_Sched_recv(rbuf, false, recvcounts[i], recvtype, i, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + } + + return OMPI_SUCCESS; +} + +static inline int a2av_sched_pairwise(int rank, int p, NBC_Schedule *schedule, + const void *sendbuf, const int *sendcounts, const int *sdispls, + MPI_Aint sndext, MPI_Datatype sendtype, + void *recvbuf, const int *recvcounts, const int *rdispls, + MPI_Aint rcvext, MPI_Datatype recvtype) { + int res; + + for (int i = 1 ; i < p ; ++i) { + int sndpeer = (rank + i) % p; + int rcvpeer = (rank + p - i) %p; + + /* post send */ + if (sendcounts[sndpeer] != 0) { + char *sbuf = ((char *) sendbuf) + (sdispls[sndpeer] * sndext); + res = NBC_Sched_send(sbuf, false, sendcounts[sndpeer], sendtype, sndpeer, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + + /* post receive */ + if (recvcounts[rcvpeer] != 0) { + char *rbuf = ((char *) recvbuf) + (rdispls[rcvpeer] * rcvext); + res = NBC_Sched_recv(rbuf, false, recvcounts[rcvpeer], recvtype, rcvpeer, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + } + + return OMPI_SUCCESS; +} + +static inline int a2av_sched_inplace(int rank, int p, NBC_Schedule *schedule, + void *buf, const int *counts, const int *displs, + MPI_Aint ext, MPI_Datatype type, ptrdiff_t gap) { + int res; + + for (int i = 1; i < (p+1)/2; i++) { + int speer = (rank + i) % p; + int rpeer = (rank + p - i) % p; + char *sbuf = (char *) buf + displs[speer] * ext; + char *rbuf = (char *) buf + displs[rpeer] * ext; + + if (0 != counts[rpeer]) { + res = NBC_Sched_copy (rbuf, false, counts[rpeer], type, + (void *)(-gap), true, counts[rpeer], type, + schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + if (0 != counts[speer]) { + res = NBC_Sched_send (sbuf, false , counts[speer], type, speer, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + if (0 != counts[rpeer]) { + res = NBC_Sched_recv (rbuf, false , counts[rpeer], type, rpeer, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + + if (0 != counts[rpeer]) { + res = NBC_Sched_send ((void *)(-gap), true, counts[rpeer], type, rpeer, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + if (0 != counts[speer]) { + res = NBC_Sched_recv (sbuf, false, counts[speer], type, speer, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + } + if (0 == (p%2)) { + int peer = (rank + p/2) % p; + + char *tbuf = (char *) buf + displs[peer] * ext; + res = NBC_Sched_copy (tbuf, false, counts[peer], type, + (void *)(-gap), true, counts[peer], type, + schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + res = NBC_Sched_send ((void *)(-gap), true , counts[peer], type, peer, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + res = NBC_Sched_recv (tbuf, false , counts[peer], type, peer, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/libnbc/nbc_ialltoallw.c b/ompi/mca/coll/libnbc/nbc_ialltoallw.c index 8f64ec3220a..ec29a3a355f 100644 --- a/ompi/mca/coll/libnbc/nbc_ialltoallw.c +++ b/ompi/mca/coll/libnbc/nbc_ialltoallw.c @@ -5,7 +5,7 @@ * Corporation. All rights reserved. * Copyright (c) 2006 The Technical University of Chemnitz. All * rights reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. @@ -15,6 +15,22 @@ */ #include "nbc_internal.h" +static inline int a2aw_sched_linear(int rank, int p, NBC_Schedule *schedule, + const void *sendbuf, const int *sendcounts, const int *sdispls, + struct ompi_datatype_t * const * sendtypes, + void *recvbuf, const int *recvcounts, const int *rdispls, + struct ompi_datatype_t * const * recvtypes); + +static inline int a2aw_sched_pairwise(int rank, int p, NBC_Schedule *schedule, + const void *sendbuf, const int *sendcounts, const int *sdispls, + struct ompi_datatype_t * const * sendtypes, + void *recvbuf, const int *recvcounts, const int *rdispls, + struct ompi_datatype_t * const * recvtypes); + +static inline int a2aw_sched_inplace(int rank, int p, NBC_Schedule *schedule, + void *buf, const int *counts, const int *displs, + struct ompi_datatype_t * const * types); + /* an alltoallw schedule can not be cached easily because the contents * ot the recvcounts array may change, so a comparison of the address * would not be sufficient ... we simply do not cache it */ @@ -28,6 +44,7 @@ int ompi_coll_libnbc_ialltoallw(const void* sendbuf, const int *sendcounts, cons int rank, p, res; NBC_Schedule *schedule; char *rbuf, *sbuf, inplace; + ptrdiff_t span=0; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; @@ -36,8 +53,29 @@ int ompi_coll_libnbc_ialltoallw(const void* sendbuf, const int *sendcounts, cons rank = ompi_comm_rank (comm); p = ompi_comm_size (comm); + res = NBC_Init_handle (comm, &handle, libnbc_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + /* copy data to receivbuffer */ - if ((sendcounts[rank] != 0) && !inplace) { + if (inplace) { + ptrdiff_t lgap, lspan; + for (int i = 0; i < p; i++) { + lspan = opal_datatype_span(&recvtypes[i]->super, recvcounts[i], &lgap); + if (lspan > span) { + span = lspan; + } + } + handle->tmpbuf = malloc(span); + if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { + NBC_Return_handle (handle); + return OMPI_ERR_OUT_OF_RESOURCE; + } + sendcounts = recvcounts; + sdispls = rdispls; + sendtypes = recvtypes; + } else if (sendcounts[rank] != 0) { rbuf = (char *) recvbuf + rdispls[rank]; sbuf = (char *) sendbuf + sdispls[rank]; res = NBC_Copy(sbuf, sendcounts[rank], sendtypes[rank], rbuf, recvcounts[rank], recvtypes[rank], comm); @@ -48,42 +86,27 @@ int ompi_coll_libnbc_ialltoallw(const void* sendbuf, const int *sendcounts, cons schedule = OBJ_NEW(NBC_Schedule); if (OPAL_UNLIKELY(NULL == schedule)) { + NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; } - for (int i = 0; i < p; i++) { - if (i == rank) { - continue; - } - - /* post all sends */ - if (sendcounts[i] != 0) { - sbuf = (char *) sendbuf + sdispls[i]; - res = NBC_Sched_send (sbuf, false, sendcounts[i], sendtypes[i], i, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - return res; - } - } - /* post all receives */ - if (recvcounts[i] != 0) { - rbuf = (char *) recvbuf + rdispls[i]; - res = NBC_Sched_recv (rbuf, false, recvcounts[i], recvtypes[i], i, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - return res; - } - } + if (inplace) { + res = a2aw_sched_inplace(rank, p, schedule, recvbuf, + recvcounts, rdispls, recvtypes); + } else { + res = a2aw_sched_linear(rank, p, schedule, + sendbuf, sendcounts, sdispls, sendtypes, + recvbuf, recvcounts, rdispls, recvtypes); } - - res = NBC_Sched_commit (schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + NBC_Return_handle (handle); OBJ_RELEASE(schedule); return res; } - res = NBC_Init_handle (comm, &handle, libnbc_module); + res = NBC_Sched_commit (schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + NBC_Return_handle (handle); OBJ_RELEASE(schedule); return res; } @@ -97,7 +120,7 @@ int ompi_coll_libnbc_ialltoallw(const void* sendbuf, const int *sendcounts, cons *request = (ompi_request_t *) handle; return OMPI_SUCCESS; -} +} /* simple linear Alltoallw */ int ompi_coll_libnbc_ialltoallw_inter (const void* sendbuf, const int *sendcounts, const int *sdispls, @@ -161,3 +184,139 @@ int ompi_coll_libnbc_ialltoallw_inter (const void* sendbuf, const int *sendcount return OMPI_SUCCESS; } + +static inline int a2aw_sched_linear(int rank, int p, NBC_Schedule *schedule, + const void *sendbuf, const int *sendcounts, const int *sdispls, + struct ompi_datatype_t * const * sendtypes, + void *recvbuf, const int *recvcounts, const int *rdispls, + struct ompi_datatype_t * const * recvtypes) { + int res; + + for (int i = 0; i < p; i++) { + if (i == rank) { + continue; + } + + /* post send */ + if (sendcounts[i] != 0) { + char *sbuf = (char *) sendbuf + sdispls[i]; + res = NBC_Sched_send (sbuf, false, sendcounts[i], sendtypes[i], i, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + /* post receive */ + if (recvcounts[i] != 0) { + char *rbuf = (char *) recvbuf + rdispls[i]; + res = NBC_Sched_recv (rbuf, false, recvcounts[i], recvtypes[i], i, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + } + + return OMPI_SUCCESS; +} + +static inline int a2aw_sched_pairwise(int rank, int p, NBC_Schedule *schedule, + const void *sendbuf, const int *sendcounts, const int *sdispls, + struct ompi_datatype_t * const * sendtypes, + void *recvbuf, const int *recvcounts, const int *rdispls, + struct ompi_datatype_t * const * recvtypes) { + int res; + + for (int i = 1; i < p; i++) { + int sndpeer = (rank + i) % p; + int rcvpeer = (rank + p - i) % p; + + /* post send */ + if (sendcounts[sndpeer] != 0) { + char *sbuf = (char *) sendbuf + sdispls[sndpeer]; + res = NBC_Sched_send (sbuf, false, sendcounts[sndpeer], sendtypes[sndpeer], sndpeer, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + /* post receive */ + if (recvcounts[rcvpeer] != 0) { + char *rbuf = (char *) recvbuf + rdispls[rcvpeer]; + res = NBC_Sched_recv (rbuf, false, recvcounts[rcvpeer], recvtypes[rcvpeer], rcvpeer, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + } + + return OMPI_SUCCESS; +} + +static inline int a2aw_sched_inplace(int rank, int p, NBC_Schedule *schedule, + void *buf, const int *counts, const int *displs, + struct ompi_datatype_t * const * types) { + ptrdiff_t gap; + int res; + + for (int i = 1; i < (p+1)/2; i++) { + int speer = (rank + i) % p; + int rpeer = (rank + p - i) % p; + char *sbuf = (char *) buf + displs[speer]; + char *rbuf = (char *) buf + displs[rpeer]; + + if (0 != counts[rpeer]) { + (void)opal_datatype_span(&types[rpeer]->super, counts[rpeer], &gap); + res = NBC_Sched_copy (rbuf, false, counts[rpeer], types[rpeer], + (void *)(-gap), true, counts[rpeer], types[rpeer], + schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + if (0 != counts[speer]) { + res = NBC_Sched_send (sbuf, false , counts[speer], types[speer], speer, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + if (0 != counts[rpeer]) { + res = NBC_Sched_recv (rbuf, false , counts[rpeer], types[rpeer], rpeer, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + + if (0 != counts[rpeer]) { + res = NBC_Sched_send ((void *)(-gap), true, counts[rpeer], types[rpeer], rpeer, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + if (0 != counts[speer]) { + res = NBC_Sched_recv (sbuf, false, counts[speer], types[speer], speer, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + } + if (0 == (p%2)) { + int peer = (rank + p/2) % p; + + char *tbuf = (char *) buf + displs[peer]; + (void)opal_datatype_span(&types[peer]->super, counts[peer], &gap); + res = NBC_Sched_copy (tbuf, false, counts[peer], types[peer], + (void *)(-gap), true, counts[peer], types[peer], + schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + res = NBC_Sched_send ((void *)(-gap), true , counts[peer], types[peer], peer, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + res = NBC_Sched_recv (tbuf, false , counts[peer], types[peer], peer, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + + return OMPI_SUCCESS; +} From 2a55cb123ee17c7096ed8cdd0cd183f04e6ff143 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 5 Oct 2016 11:57:45 +0900 Subject: [PATCH 2/2] ompi: accept MPI_IN_PLACE in MPI_Ialltoall* (cherry picked from commit open-mpi/ompi@7cae36f5ab6735239a91632d3c4e7f5a84d9d930) --- ompi/mpi/c/ialltoall.c | 2 +- ompi/mpi/c/ialltoallv.c | 2 +- ompi/mpi/c/ialltoallw.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ompi/mpi/c/ialltoall.c b/ompi/mpi/c/ialltoall.c index f7b59f9b22c..e1bbb234a5c 100644 --- a/ompi/mpi/c/ialltoall.c +++ b/ompi/mpi/c/ialltoall.c @@ -14,7 +14,7 @@ * Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * diff --git a/ompi/mpi/c/ialltoallv.c b/ompi/mpi/c/ialltoallv.c index 6876548c8e2..c28136bfbaf 100644 --- a/ompi/mpi/c/ialltoallv.c +++ b/ompi/mpi/c/ialltoallv.c @@ -13,7 +13,7 @@ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * diff --git a/ompi/mpi/c/ialltoallw.c b/ompi/mpi/c/ialltoallw.c index 18e9fb523d5..59aa13bba0c 100644 --- a/ompi/mpi/c/ialltoallw.c +++ b/ompi/mpi/c/ialltoallw.c @@ -13,7 +13,7 @@ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ *