diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..acb7661 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "fftw3"] + path = fftw3 + url = https://github.com/rainwoodman/fftw3 diff --git a/Makefile.am b/Makefile.am index eb1ccd6..ef5dc59 100644 --- a/Makefile.am +++ b/Makefile.am @@ -16,7 +16,7 @@ AM_DISTCHECK_CONFIGURE_FLAGS = \ "LDFLAGS=@LDFLAGS@" # Traverse these subdirectories, the current one first (for config.h). -SUBDIRS = kernel gcell util api . +SUBDIRS = fftw3 kernel gcell util api . if ENABLE_TESTS SUBDIRS += tests @@ -46,7 +46,7 @@ lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_LIBADD = \ gcell/libgcell.la lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_LDFLAGS = -no-undefined -version-info @SHARED_VERSION_INFO@ -lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_CFLAGS = $(OPENMP_CFLAGS) +lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@_la_CFLAGS = $(OPENMP_CFLAGS) -I $(top_srcdir)/mpi -I $(top_srcdir)/api # Get Fortran compile rules that include preprocessing. include $(top_srcdir)/build-aux/fortran-rules.am @@ -58,6 +58,8 @@ pfft1@PREC_SUFFIX@@OPENMP_SUFFIX@.pc: pfft.pc pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = pfft.pc +DISTCHECK_CONFIGURE_FLAGS=--disable-shared --disable-doc --enable-mpi --enable-fortran + ################################################################# # Documentation ################################################################# diff --git a/api/Makefile.am b/api/Makefile.am index 596a3cb..6b2ff47 100644 --- a/api/Makefile.am +++ b/api/Makefile.am @@ -3,6 +3,10 @@ AM_CPPFLAGS = -I$(top_srcdir)/kernel # Directory of util.h AM_CPPFLAGS += -I$(top_srcdir)/util +# +# Directory of fftw3 +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/mpi # OpenMP support AM_CFLAGS = $(OPENMP_CFLAGS) diff --git a/api/api-adv.c b/api/api-adv.c index 596760e..e1edca8 100644 --- a/api/api-adv.c +++ b/api/api-adv.c @@ -22,11 +22,6 @@ #include "ipfft.h" #include "util.h" -static void calculate_3dto2d_blocks( - const INT *n, MPI_Comm comm_cart_3d, - INT *iblk); - - INT PX(local_size_many_dft)( int rnk_n, const INT *n, const INT *ni, const INT *no, INT howmany, const INT *iblock, const INT *oblock, @@ -238,7 +233,7 @@ PX(gcplan) PX(plan_many_rgc)( ) { int rnk_pm; - INT blk_3dto2d[3]; + INT blk_for_remap[3]; MPI_Comm *comms_pm; PX(gcplan) ths; MPI_Comm comm_cart = PX(assure_cart_comm)(comm); @@ -256,13 +251,13 @@ PX(gcplan) PX(plan_many_rgc)( comms_pm = (MPI_Comm*) malloc(sizeof(MPI_Comm) * (size_t) rnk_pm); PX(split_cart_procmesh)(comm_cart, comms_pm); - if( PX(needs_3dto2d_remap)(rnk_n, comm_cart) ){ + if( PX(needs_remap_nd)(rnk_n, comm_cart) ){ /* 3d to 2d remap results in complicated blocks. * We ignore users input and use default block size. */ - calculate_3dto2d_blocks(pn, comm_cart, - blk_3dto2d); + PX(remap_nd_calculate_blocks)(rnk_n, pn, comm_cart, + blk_for_remap); ths = PX(plan_rgc_internal)( - rnk_n, pn, howmany, blk_3dto2d, gc_below, gc_above, + rnk_n, pn, howmany, blk_for_remap, gc_below, gc_above, data, rnk_pm, comms_pm, comm_cart, gc_flags); } else ths = PX(plan_rgc_internal)( @@ -279,19 +274,6 @@ PX(gcplan) PX(plan_many_rgc)( return ths; } -static void calculate_3dto2d_blocks( - const INT *n, MPI_Comm comm_cart_3d, - INT *iblk - ) -{ - int q0, q1, p0, p1; - INT mblk[3], oblk[3]; - - PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); - PX(default_block_size_3dto2d)(n, p0, p1, q0, q1, - iblk, mblk, oblk); -} - PX(gcplan) PX(plan_many_cgc)( int rnk_n, const INT *n, INT howmany, const INT *block, diff --git a/api/api-basic.c b/api/api-basic.c index aab76be..36b9d86 100644 --- a/api/api-basic.c +++ b/api/api-basic.c @@ -177,7 +177,6 @@ static R check_array( case PFFTI_ARRAYTYPE_HERMITIAN_COMPLEX: err = cabs( ((C*)data)[k] - 0.5 * (d1 + conj(d2))); break; } - if( err > maxerr ) maxerr = err; } @@ -258,7 +257,7 @@ static INT plain_index( INT k=0; for(INT t=0; tin, ths->out, in, out); PFFT_FINISH_TIMING(ths->timer->itwiddle); - PFFT_START_TIMING(ths->comm_cart, ths->timer->remap_3dto2d[0]); - PX(execute_remap_3dto2d)(ths->remap_3dto2d[0], ths->in, ths->out, in, out); - PFFT_FINISH_TIMING(ths->timer->remap_3dto2d[0]); + PFFT_START_TIMING(ths->comm_cart, ths->timer->remap_nd[0]); + PX(execute_remap_nd)(ths->remap_nd[0], ths->in, ths->out, in, out); + PFFT_FINISH_TIMING(ths->timer->remap_nd[0]); + execute_transposed(r, ths->serial_trafo, ths->global_remap, ths->timer->trafo, ths->timer->remap, ths->in, ths->out, in, out, ths->comm_cart); execute_transposed(r, &ths->serial_trafo[r+1], &ths->global_remap[r], &ths->timer->trafo[r+1], &ths->timer->remap[r], ths->in, ths->out, in, out, ths->comm_cart); - - PFFT_START_TIMING(ths->comm_cart, ths->timer->remap_3dto2d[1]); - PX(execute_remap_3dto2d)(ths->remap_3dto2d[1], ths->in, ths->out, in, out); - PFFT_FINISH_TIMING(ths->timer->remap_3dto2d[1]); + + PFFT_START_TIMING(ths->comm_cart, ths->timer->remap_nd[1]); + + PX(execute_remap_nd)(ths->remap_nd[1], ths->in, ths->out, in, out); + + PFFT_FINISH_TIMING(ths->timer->remap_nd[1]); /* twiddle outputs in order to get inputs shifted by n/2 */ PFFT_START_TIMING(ths->comm_cart, ths->timer->otwiddle); diff --git a/api/pfft.h b/api/pfft.h index 856c2fe..d36841e 100644 --- a/api/pfft.h +++ b/api/pfft.h @@ -499,8 +499,19 @@ BEGIN_C_DECLS PFFT_EXTERN PX(gctimer) PX(convert_vec2gctimer)( \ const double *times); \ PFFT_EXTERN void PX(destroy_gctimer)( \ - PX(gctimer) ths); - + PX(gctimer) ths); \ + \ + PFFT_EXTERN R * PX(gather_array)(const int rnk_n, const int howmany, \ + const R * local, \ + const INT* local_start, \ + const INT * local_n, \ + const INT * global_n, \ + MPI_Comm comm); \ + void PX(print_array)(int rnk_n, \ + int howmany, \ + R * a, \ + const INT * ni, \ + MPI_Comm comm); \ #define PFFT_MANGLE_DOUBLE(name) PFFT_CONCAT(pfft_, name) diff --git a/bootstrap.sh b/bootstrap.sh index 2d34044..b37d5b9 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -35,12 +35,12 @@ alias libtoolize=$(type -p glibtoolize libtoolize | head -1) touch ChangeLog +touch fftw3/ChangeLog echo "PLEASE IGNORE WARNINGS AND ERRORS" - rm -rf autom4te.cache libtoolize -autoreconf --verbose --install --force +autoreconf --verbose --install --symlink --force rm -f config.cache diff --git a/configure.ac b/configure.ac index 796603c..9683c4f 100644 --- a/configure.ac +++ b/configure.ac @@ -31,7 +31,7 @@ AC_PREREQ(2.64) # autoconf initialization -AC_INIT([PFFT],[1.0.8-alpha],[michael.pippig@mathematik.tu-chemnitz.de],[pfft],[http://www.tu-chemnitz.de/~mpip/software/]) +AC_INIT([PFFT],[1.0.8-alpha3-fftw3],[michael.pippig@mathematik.tu-chemnitz.de],[pfft],[http://www.tu-chemnitz.de/~mpip/software/]) m4_ifndef([AC_PACKAGE_URL], [AC_SUBST([PACKAGE_URL], [http://www.tu-chemnitz.de/~mpip/software/])]) @@ -263,6 +263,7 @@ SHARED_VERSION_INFO="0:0:0" # substitute SHARED_VERSION_INFO in generated Makefiles AC_SUBST(SHARED_VERSION_INFO) +AC_DISABLE_SHARED dnl to hell with shared libraries ################################################################################ # program checks @@ -280,37 +281,6 @@ AC_SUBST(SHARED_VERSION_INFO) # May need sincos from libm. AC_CHECK_LIB([m], [sincos]) -# Check for FFTW3, MPI FFTW and threaded FFTW. -if test "x$PRECISION" = "xs" ; then - AX_LIB_FFTW3F -elif test "x$PRECISION" = "xl" ; then - AX_LIB_FFTW3L -else - AX_LIB_FFTW3 -fi - -if test "x$ax_lib_fftw3" = "xno"; then - AC_MSG_ERROR([You do not seem to have the FFTW-3.3 library installed.] - [You can download it from http://www.fftw.org. If you have installed FFTW-3.3, ] - [make sure that this configure script can find it. See ./configure ] - [--help for more information.]) -fi - -if test "x$ax_lib_fftw3_mpi" = "xno"; then - AC_MSG_ERROR([You do not seem to have the MPI part of the FFTW-3.3 library installed.] - [You can download it from http://www.fftw.org. If you have installed FFTW-3.3, ] - [make sure that this configure script can find it. See ./configure --help] - [for more information.]) -fi - -if test "x$enable_openmp" = "xyes" -a "x$ax_lib_fftw3_openmp" = "xno"; then - AC_MSG_ERROR([You do not seem to have the OpenMP part of the FFTW-3.3 library installed.] - [You can download it from http://www.fftw.org. If you have installed FFTW-3.3, ] - [make sure that this configure script can find it. See ./configure --help] - [for more information.]) -fi - - ################################################################################ # compiler options ################################################################################ @@ -482,4 +452,6 @@ AC_CONFIG_LINKS([tests/build_checks.sh:tests/build_checks.sh tests/run_checks.sh:tests/run_checks.sh tests/manual_c2c_3d.c:tests/manual_c2c_3d.c]) +AC_CONFIG_SUBDIRS([fftw3]) + AC_OUTPUT diff --git a/fftw3 b/fftw3 new file mode 160000 index 0000000..7a893f4 --- /dev/null +++ b/fftw3 @@ -0,0 +1 @@ +Subproject commit 7a893f428f093c034457de69011d6acb39aafd6f diff --git a/gcell/Makefile.am b/gcell/Makefile.am index 9f28d98..a0f7bd9 100644 --- a/gcell/Makefile.am +++ b/gcell/Makefile.am @@ -6,6 +6,10 @@ AM_CPPFLAGS += -I$(top_srcdir)/api # Directory of util.h AM_CPPFLAGS += -I$(top_srcdir)/util +# +# Directory of fftw3 +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/mpi noinst_LTLIBRARIES = libgcell.la diff --git a/kernel/Makefile.am b/kernel/Makefile.am index 9e3ad52..265dd46 100644 --- a/kernel/Makefile.am +++ b/kernel/Makefile.am @@ -6,6 +6,10 @@ AM_CPPFLAGS += -I$(top_srcdir)/api # Directory of util.h AM_CPPFLAGS += -I$(top_srcdir)/util +# +# Directory of fftw +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/mpi noinst_LTLIBRARIES = libkernel.la @@ -21,6 +25,8 @@ libkernel_la_SOURCES = \ partrafo.c \ sertrafo.c \ procmesh.c \ + remap.c \ remap_3dto2d.c \ + remap_2dto1d.c \ timer.c \ transpose.c diff --git a/kernel/ipfft.h b/kernel/ipfft.h index 6d49503..a2ff314 100644 --- a/kernel/ipfft.h +++ b/kernel/ipfft.h @@ -213,7 +213,7 @@ typedef struct PX(timer_s) { double whole; double *trafo; double *remap; - double remap_3dto2d[2]; + double remap_nd[2]; double itwiddle; double otwiddle; } PX(timer_s); @@ -252,11 +252,12 @@ typedef sertrafo_dbg_s *sertrafo_dbg; /* a fftw plan */ typedef void (* PX(fftw_execute))(const X(plan) , R * in, R * out); typedef struct { - X(plan) plan; - R *planned_in; /* in and out array used at the time of plannning */ - R *planned_out; + X(plan) fftw; + R *in; /* in and out array used at the time of plannning */ + R *out; PX(fftw_execute) execute; } PX(fftw_plan); + /* this function is put in sertrafo.c, it should be in a different * file; as it is also used in transpose.c */ void PX(execute_fftw_plan)( @@ -348,8 +349,8 @@ typedef struct{ sertrafo_plan local_transp[2]; int q0; int q1; -} remap_3dto2d_plan_s; -typedef remap_3dto2d_plan_s *remap_3dto2d_plan; +} remap_nd_plan_s; +typedef remap_nd_plan_s *remap_nd_plan; /* plan for parallel trafo */ typedef struct PX(plan_s){ @@ -385,7 +386,7 @@ typedef struct PX(plan_s){ gtransp_plan *global_remap; outrafo_plan *serial_trafo; - remap_3dto2d_plan remap_3dto2d[2]; + remap_nd_plan remap_nd[2]; /* save data pointers to input and output for for complex conjugation in the cases * PFFT_FORWARD & PFFTI_TRAFO_C2R and PFFT_BACKWARD & PFFTI_TRAFO_R2C */ @@ -586,6 +587,82 @@ MPI_Comm PX(assure_cart_comm)( void PX(split_cart_procmesh)( MPI_Comm comm_cart, MPI_Comm *comms_1d); + +int PX(get_mpi_cart_coord_1d)(MPI_Comm comm_cart_1d, int *coord); +int PX(get_mpi_cart_coords)(MPI_Comm comm_cart, int maxdims, int *coords); +int PX(get_mpi_cart_dims)(MPI_Comm comm_cart, int maxdims, int *dims); + + +/* remap.c */ +int PX(needs_remap_nd)( + int rnk_n, MPI_Comm comm_cart); + +void PX(local_block_remap_nd_transposed)( + int rnk_n, const INT *n, + MPI_Comm comm_cart, int pid, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start); +int PX(local_size_remap_nd_transposed)( + int rnk_n, const INT *n, INT howmany, + MPI_Comm comm_cart, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start); +remap_nd_plan PX(plan_remap_nd_transposed)( + int rnk_n, const INT *n, INT howmany, + MPI_Comm comm_cart, R *in, R *out, + unsigned transp_flag, unsigned trafo_flag, + unsigned opt_flag, unsigned io_flag, unsigned fftw_flags); +void PX(execute_remap_nd)( + remap_nd_plan ths, R *plannedin, R *plannedout, R *in, R *out); +void PX(remap_nd_rmplan)( + remap_nd_plan ths); + +void PX(remap_nd_split_cart_procmesh)( + const INT rnk_n, + MPI_Comm comm_cart, + MPI_Comm * comms_pm); + +void PX(remap_nd_get_coords)( + const INT rnk_n, + const INT pid, + MPI_Comm comm_cart, + int *np_pm, int *coords_pm); + +void PX(remap_nd_calculate_blocks)( + const INT rnk_n, + const INT *n, MPI_Comm comm_cart, + INT *iblk + ); + +/* remap_3dto2d.c */ + + +void PX(local_block_remap_3dto2d_transposed)( + int rnk_n, const INT *n, + MPI_Comm comm_cart_3d, int pid, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start); +int PX(local_size_remap_3dto2d_transposed)( + int rnk_n, const INT *n, INT howmany, + MPI_Comm comm_cart_3d, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start); + +remap_nd_plan PX(plan_remap_3dto2d_transposed)( + remap_nd_plan ths, + int rnk_n, const INT *n, INT howmany, + MPI_Comm comm_cart_3d, R *in, R *out, + unsigned transp_flag, unsigned trafo_flag, + unsigned opt_flag, unsigned io_flag, unsigned fftw_flags); + +void PX(default_block_size_3dto2d)( + const INT *n, int p0, int p1, int q0, int q1, + INT *iblk, INT *mblk, INT *oblk); + void PX(coords_3dto2d)( int q0, int q1, const int *coords_3d, int *coords_2d); @@ -604,46 +681,51 @@ void PX(split_cart_procmesh_for_3dto2d_remap_q0)( void PX(split_cart_procmesh_for_3dto2d_remap_q1)( MPI_Comm comm_cart_3d, MPI_Comm *comms_1d); -int PX(get_mpi_cart_coord_1d)(MPI_Comm comm_cart_1d, int *coord); -int PX(get_mpi_cart_coords)(MPI_Comm comm_cart, int maxdims, int *coords); -int PX(get_mpi_cart_dims)(MPI_Comm comm_cart, int maxdims, int *dims); - -/* remap_3dto2d.c */ +/* remap_2dto1d.c */ -void PX(local_block_remap_3dto2d_transposed)( +void PX(local_block_remap_2dto1d_transposed)( int rnk_n, const INT *n, MPI_Comm comm_cart_3d, int pid, unsigned transp_flag, unsigned trafo_flag, INT *local_ni, INT *local_i_start, - INT *local_no, INT *local_o_start); -int PX(local_size_remap_3dto2d_transposed)( + INT *local_no, INT *local_o_start); +int PX(local_size_remap_2dto1d_transposed)( int rnk_n, const INT *n, INT howmany, MPI_Comm comm_cart_3d, unsigned transp_flag, unsigned trafo_flag, INT *local_ni, INT *local_i_start, INT *local_no, INT *local_o_start); -remap_3dto2d_plan PX(plan_remap_3dto2d_transposed)( + +remap_nd_plan PX(plan_remap_2dto1d_transposed)( + remap_nd_plan ths, int rnk_n, const INT *n, INT howmany, MPI_Comm comm_cart_3d, R *in, R *out, unsigned transp_flag, unsigned trafo_flag, unsigned opt_flag, unsigned io_flag, unsigned fftw_flags); -void PX(execute_remap_3dto2d)( - remap_3dto2d_plan ths, R *plannedin, R *plannedout, R *in, R *out); -void PX(remap_3dto2d_rmplan)( - remap_3dto2d_plan ths); -void PX(default_block_size_3dto2d)( - const INT *n, int p0, int p1, int q0, int q1, - INT *iblk, INT *mblk, INT *oblk); - - - - - - - +void PX(default_block_size_2dto1d)( + const INT *n, int p0, int q0, + INT *iblk, INT *oblk); +void PX(coords_2dto1d)( + int q0, const int *coords_2d, + int *coords_1d); +void PX(split_cart_procmesh_2dto1d_p0q0)( + MPI_Comm comm_cart_2d, + MPI_Comm *comm_1d); +void PX(split_cart_procmesh_2dto1d_p1q1)( + MPI_Comm comm_cart_2d, + MPI_Comm *comm_1d); +void PX(get_procmesh_dims_1d)( + MPI_Comm comm_cart_2d, + int *p0, int *q0); +void PX(split_cart_procmesh_for_2dto1d_remap_q0)( + MPI_Comm comm_cart_2d, + MPI_Comm *comm_1d); +void PX(split_cart_procmesh_for_2dto1d_remap_q1)( + MPI_Comm comm_cart_2d, + MPI_Comm *comms_1d); /* timer.c */ diff --git a/kernel/partrafo.c b/kernel/partrafo.c index 47a07e9..97cd111 100644 --- a/kernel/partrafo.c +++ b/kernel/partrafo.c @@ -151,26 +151,26 @@ void PX(local_block_partrafo)( &lni_to, &lis_to, &lno_to, &los_to, &lni_ti, &lis_ti, &lno_ti, &los_ti); - /* overwrite input blocks if remap_3dto2d is used */ + /* overwrite input blocks if remap_nd is used */ if( ~transp_flag & PFFT_TRANSPOSED_IN ){ PX(local_block_partrafo_transposed)( rnk_n, pni_to, pno_to, iblk, mblk, rnk_pm, coords_pm, PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], lni_to, lis_to, lno_to, los_to); - PX(local_block_remap_3dto2d_transposed)( + PX(local_block_remap_nd_transposed)( rnk_n, pni_to, comm_cart, pid, PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], local_ni, local_i_start, dummy_ln, dummy_ls); } - /* overwrite input blocks if remap_3dto2d is used */ + /* overwrite input blocks if remap_nd is used */ if( ~transp_flag & PFFT_TRANSPOSED_OUT ){ PX(local_block_partrafo_transposed)( rnk_n, pni_ti, pno_ti, mblk, oblk, rnk_pm, coords_pm, PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], lni_ti, lis_ti, lno_ti, los_ti); - PX(local_block_remap_3dto2d_transposed)( + PX(local_block_remap_nd_transposed)( rnk_n, pno_ti, comm_cart, pid, PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], dummy_ln, dummy_ls, local_no, local_o_start); } @@ -264,10 +264,12 @@ INT PX(local_size_partrafo)( lni_to, lis_to, lno_to, los_to); mem = MAX(mem, mem_tmp); - mem_tmp = PX(local_size_remap_3dto2d_transposed)( - rnk_n, pni_to, howmany, comm_cart, PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], - lni_to, lis_to, dummy_ln, dummy_ls); - mem = MAX(mem, mem_tmp); + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) { + mem_tmp = PX(local_size_remap_nd_transposed)( + rnk_n, pni_to, howmany, comm_cart, PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], + lni_to, lis_to, dummy_ln, dummy_ls); + mem = MAX(mem, mem_tmp); + } } /* plan with transposed input */ @@ -278,10 +280,12 @@ INT PX(local_size_partrafo)( lni_ti, lis_ti, lno_ti, los_ti); mem = MAX(mem, mem_tmp); - mem_tmp = PX(local_size_remap_3dto2d_transposed)( - rnk_n, pno_ti, howmany, comm_cart, PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], - dummy_ln, dummy_ls, lno_ti, los_ti); - mem = MAX(mem, mem_tmp); + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) { + mem_tmp = PX(local_size_remap_nd_transposed)( + rnk_n, pno_ti, howmany, comm_cart, PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], + dummy_ln, dummy_ls, lno_ti, los_ti); + mem = MAX(mem, mem_tmp); + } } if(pfft_flags & PFFT_SHIFTED_IN){ @@ -367,9 +371,9 @@ PX(plan) PX(plan_partrafo)( if(rnk_n < rnk_pm) return NULL; - /* equal dimension of FFT and procmesh only implemented for 3 dimensions */ + /* equal dimension of FFT and procmesh only implemented for 3 and 2dimensions */ if(rnk_n == rnk_pm) - if(rnk_n != 3) + if(rnk_n != 3 && rnk_n != 2) return NULL; malloc_and_split_cart_procmesh(rnk_n, transp_flag, comm_cart, @@ -458,17 +462,20 @@ PX(plan) PX(plan_partrafo)( /* plan with transposed output */ if(transp_flag & PFFT_TRANSPOSED_IN){ - ths->remap_3dto2d[0] = NULL; + ths->remap_nd[0] = NULL; set_plans_to_null(rnk_pm, PFFT_TRANSPOSED_OUT, ths->serial_trafo, ths->global_remap); } else { - ths->remap_3dto2d[0] = PX(plan_remap_3dto2d_transposed)( - rnk_n, pni_to, howmany, comm_cart, in, out, - PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], opt_flag, io_flag, fftw_flags); - - /* If remap_3dto2d exists, go on with in-place transforms in order to preserve input. */ - if( (ths->remap_3dto2d[0] != NULL) && (~io_flag & PFFT_DESTROY_INPUT) ) + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) { + ths->remap_nd[0] = PX(plan_remap_nd_transposed)( + rnk_n, pni_to, howmany, comm_cart, in, out, + PFFT_TRANSPOSED_OUT, trafo_flags_to[rnk_pm], opt_flag, io_flag, fftw_flags); + } else { + ths->remap_nd[0] = NULL; + } + /* If remap_nd exists, go on with in-place transforms in order to preserve input. */ + if( (ths->remap_nd[0] != NULL) && (~io_flag & PFFT_DESTROY_INPUT) ) in = out; PX(plan_partrafo_transposed)( @@ -486,7 +493,7 @@ PX(plan) PX(plan_partrafo)( if(transp_flag & PFFT_TRANSPOSED_OUT){ set_plans_to_null(rnk_pm, PFFT_TRANSPOSED_IN, ths->serial_trafo, ths->global_remap); - ths->remap_3dto2d[1] = NULL; + ths->remap_nd[1] = NULL; } else { PX(plan_partrafo_transposed)( rnk_n, pn_ti, pni_ti, pno_ti, howmany, mblk, oblk, @@ -498,9 +505,13 @@ PX(plan) PX(plan_partrafo)( if( ~io_flag & PFFT_DESTROY_INPUT ) in = out; - ths->remap_3dto2d[1] = PX(plan_remap_3dto2d_transposed)( - rnk_n, pno_ti, howmany, comm_cart, out, in, - PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], opt_flag, io_flag, fftw_flags); + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) { + ths->remap_nd[1] = PX(plan_remap_nd_transposed)( + rnk_n, pno_ti, howmany, comm_cart, out, in, + PFFT_TRANSPOSED_IN, trafo_flags_ti[rnk_pm], opt_flag, io_flag, fftw_flags); + } else { + ths->remap_nd[1] = NULL; + } } @@ -535,18 +546,16 @@ static void malloc_and_split_cart_procmesh( { MPI_Cartdim_get(comm_cart, rnk_pm); - if( PX(needs_3dto2d_remap)(rnk_n, comm_cart) ) - *rnk_pm = 2; + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) + *rnk_pm = rnk_n - 1; *comms_pm = (MPI_Comm*) malloc(sizeof(MPI_Comm) * (size_t) *rnk_pm); - if( PX(needs_3dto2d_remap)(rnk_n, comm_cart) ){ - PX(split_cart_procmesh_3dto2d_p0q0)(comm_cart, - *comms_pm + 0); - PX(split_cart_procmesh_3dto2d_p1q1)(comm_cart, - *comms_pm + 1); - } else + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) { + PX(remap_nd_split_cart_procmesh)(rnk_n, comm_cart, *comms_pm); + } else { PX(split_cart_procmesh)(comm_cart, *comms_pm); + } } static void malloc_and_compute_cart_np_and_coords( @@ -557,18 +566,14 @@ static void malloc_and_compute_cart_np_and_coords( { MPI_Cartdim_get(comm_cart, rnk_pm); - if( PX(needs_3dto2d_remap)(rnk_n, comm_cart) ) - *rnk_pm = 2; + if( PX(needs_remap_nd)(rnk_n, comm_cart) ) + *rnk_pm = rnk_n - 1; *np_pm = PX(malloc_int)(*rnk_pm); *coords_pm = PX(malloc_int)(*rnk_pm); - if( PX(needs_3dto2d_remap)(rnk_n, comm_cart) ){ - int p0, p1, q0, q1, coords_3d[3]; - PX(get_procmesh_dims_2d)(comm_cart, &p0, &p1, &q0, &q1); - MPI_Cart_coords(comm_cart, pid, 3, coords_3d); - PX(coords_3dto2d)(q0, q1, coords_3d, *coords_pm); - (*np_pm)[0] = p0*q0; (*np_pm)[1] = p1*q1; + if( PX(needs_remap_nd)(rnk_n, comm_cart) ){ + PX(remap_nd_get_coords)(rnk_n, pid, comm_cart, *coords_pm, *np_pm); } else { int *periods = PX(malloc_int)(*rnk_pm); MPI_Cart_get(comm_cart, *rnk_pm, *np_pm, periods, *coords_pm); @@ -579,8 +584,6 @@ static void malloc_and_compute_cart_np_and_coords( - - static void save_param_into_plan( int rnk_n, const INT *n, const INT *ni, const INT *no, INT howmany, const INT *iblock, const INT *mblock, const INT *oblock, @@ -930,7 +933,7 @@ void PX(rmplan)( free(ths->skip_trafos); for(int t=0; t<2; t++) - PX(remap_3dto2d_rmplan)(ths->remap_3dto2d[t]); + PX(remap_nd_rmplan)(ths->remap_nd[t]); PX(destroy_timer)(ths->timer); diff --git a/kernel/procmesh.c b/kernel/procmesh.c index bb0d2bf..14660f3 100644 --- a/kernel/procmesh.c +++ b/kernel/procmesh.c @@ -25,13 +25,6 @@ static void extract_comm_1d( int dim, MPI_Comm comm_cart, MPI_Comm *comm_1d); -static void factorize( - int q, - int *ptr_q0, int *ptr_q1); -static void factorize_equal( - int p0, int p1, int q, - int *ptr_q0, int *ptr_q1); - int PX(create_procmesh)( int rnk, MPI_Comm comm, const int *np, @@ -188,205 +181,3 @@ int PX(get_mpi_cart_dims)(MPI_Comm comm_cart, int maxdims, int *dims) return ret; } -void PX(coords_3dto2d)( - int q0, int q1, const int *coords_3d, - int *coords_2d - ) -{ - coords_2d[0] = coords_3d[0]*q0 + coords_3d[2]/q1; - coords_2d[1] = coords_3d[1]*q1 + coords_3d[2]%q1; -} - -void PX(split_cart_procmesh_3dto2d_p0q0)( - MPI_Comm comm_cart_3d, - MPI_Comm *comm_1d - ) -{ - int p0, p1, q0=0, q1=0; - int ndims, coords_3d[3]; - int dim_1d, period_1d, reorder=0; - int color, key; - MPI_Comm comm; - - if( !PX(is_cart_procmesh)(comm_cart_3d) ) - return; - - MPI_Cartdim_get(comm_cart_3d, &ndims); - if(ndims != 3) - return; - - PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); - PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); - - /* split into p1*q1 comms of size p0*q0 */ - key = coords_3d[0]*q0 + coords_3d[2]/q1; - color = coords_3d[1]*q1 + coords_3d[2]%q1; - MPI_Comm_split(comm_cart_3d, color, key, &comm); - - dim_1d = p0*q0; period_1d = 1; - MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, - comm_1d); - - MPI_Comm_free(&comm); -} - - -void PX(split_cart_procmesh_3dto2d_p1q1)( - MPI_Comm comm_cart_3d, - MPI_Comm *comm_1d - ) -{ - int p0, p1, q0=0, q1=0; - int ndims, coords_3d[3]; - int dim_1d, period_1d, reorder=0; - int color, key; - MPI_Comm comm; - - if( !PX(is_cart_procmesh)(comm_cart_3d) ) - return; - - MPI_Cartdim_get(comm_cart_3d, &ndims); - if(ndims != 3) - return; - - PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); - PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); - - /* split into p0*q0 comms of size p1*q1 */ - color = coords_3d[0]*q0 + coords_3d[2]/q1; - key = coords_3d[1]*q1 + coords_3d[2]%q1; - MPI_Comm_split(comm_cart_3d, color, key, &comm); - - dim_1d = p1*q1; period_1d = 1; - MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, - comm_1d); - - MPI_Comm_free(&comm); -} - - -/* implement the splitting to create p0*p1*q0 comms of size q1 - * and p0*p1*q1 comms of size q0 */ -void PX(split_cart_procmesh_for_3dto2d_remap_q0)( - MPI_Comm comm_cart_3d, - MPI_Comm *comm_1d - ) -{ - int p0, p1, q0=0, q1=0; - int ndims, coords_3d[3]; - int dim_1d, period_1d, reorder=0; - int color, key; - MPI_Comm comm; - - if( !PX(is_cart_procmesh)(comm_cart_3d) ) - return; - - MPI_Cartdim_get(comm_cart_3d, &ndims); - if(ndims != 3) - return; - - PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); - PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); - - /* split into p0*p1*q1 comms of size q0 */ - color = coords_3d[0]*p1*q1 + coords_3d[1]*q1 + coords_3d[2]%q1; - key = coords_3d[2]/q1; - MPI_Comm_split(comm_cart_3d, color, key, &comm); - - dim_1d = q0; period_1d = 1; - MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, - comm_1d); - - MPI_Comm_free(&comm); -} - - -void PX(split_cart_procmesh_for_3dto2d_remap_q1)( - MPI_Comm comm_cart_3d, - MPI_Comm *comm_1d - ) -{ - int p0, p1, q0=0, q1=0; - int ndims, coords_3d[3]; - int dim_1d, period_1d, reorder=0; - int color, key; - MPI_Comm comm; - - if( !PX(is_cart_procmesh)(comm_cart_3d) ) - return; - - MPI_Cartdim_get(comm_cart_3d, &ndims); - if(ndims != 3) - return; - - PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); - PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); - - /* split into p0*p1*q0 comms of size q1 */ - color = coords_3d[0]*p1*q0 + coords_3d[1]*q0 + coords_3d[2]/q1; - key = coords_3d[2]%q1; -// key = coords_3d[2]/q0; /* TODO: delete this line after several tests */ - MPI_Comm_split(comm_cart_3d, color, key, &comm); - - dim_1d = q1; period_1d = 1; - MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, - comm_1d); - - MPI_Comm_free(&comm); -} - -void PX(get_procmesh_dims_2d)( - MPI_Comm comm_cart_3d, - int *p0, int *p1, int *q0, int *q1 - ) -{ - int ndims=3, dims[3]; - - PX(get_mpi_cart_dims)(comm_cart_3d, ndims, dims); - *p0 = dims[0]; *p1 = dims[1]; -// factorize(dims[2], q0, q1); - factorize_equal(dims[0], dims[1], dims[2], q0, q1); -} - -/* factorize an integer q into q0*q1 with - * q1 <= q0 and q0-q1 -> min */ -static void factorize( - int q, - int *ptr_q0, int *ptr_q1 - ) -{ - for(int t = 1; t <= sqrt(q); t++) - if(t * (q/t) == q) - *ptr_q1 = t; - - *ptr_q0 = q / (*ptr_q1); -} - -/* factorize an integer q into q0*q1 with - * abs(p0*q0 - p1*q1) -> min */ -static void factorize_equal( - int p0, int p1, int q, - int *ptr_q0, int *ptr_q1 - ) -{ - int q0, q1; - int opt_q0 = 1; - int opt_q1 = q; - R min_err = pfft_fabs(p0 * q - p1 * 1.0); - - for(q1 = 1; q1 <= sqrt(q); q1++){ - q0 = q/q1; - if(q0*q1 == q){ - R err = pfft_fabs(p0*q0 - p1*q1); - if(err < min_err){ - min_err = err; - opt_q0 = q0; - opt_q1 = q1; - } - } - } - - *ptr_q0 = opt_q0; - *ptr_q1 = opt_q1; -} - diff --git a/kernel/remap.c b/kernel/remap.c new file mode 100644 index 0000000..9feb2de --- /dev/null +++ b/kernel/remap.c @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2011-2013 Michael Pippig + * + * This file is part of PFFT. + * + * PFFT is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * PFFT is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with PFFT. If not, see . + * + */ + + +#include "pfft.h" +#include "ipfft.h" +#include "util.h" + +/* Global infos about procmesh are only enabled in debug mode + * Otherwise we do not use any global variables. */ +#if PFFT_DEBUG_GVARS + extern MPI_Comm *gdbg_comm_cart; + extern int gdbg_rnk_pm; + extern MPI_Comm *gdbg_comms_pm; +#endif + +static remap_nd_plan remap_nd_mkplan(void); + +int PX(needs_remap_nd)( + int rnk_n, MPI_Comm comm_cart + ) +{ + int rnk_pm; + + MPI_Cartdim_get(comm_cart, &rnk_pm); + + return (rnk_n == rnk_pm); +} + +void PX(local_block_remap_nd_transposed)( + int rnk_n, const INT *n, + MPI_Comm comm_cart, int pid, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start) { + + if(rnk_n == 3) { + PX(local_block_remap_3dto2d_transposed)( + rnk_n, n, + comm_cart, pid, + transp_flag, trafo_flag, + local_ni, local_i_start, + local_no, local_o_start); + return; + } + if(rnk_n == 2) { + PX(local_block_remap_2dto1d_transposed)( + rnk_n, n, + comm_cart, pid, + transp_flag, trafo_flag, + local_ni, local_i_start, + local_no, local_o_start); + return; + } + abort(); +} + +int PX(local_size_remap_nd_transposed)( + int rnk_n, const INT *n, INT howmany, + MPI_Comm comm_cart, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start) { + if(rnk_n == 3) { + return + PX(local_size_remap_3dto2d_transposed)( + rnk_n, n, howmany, + comm_cart, + transp_flag, trafo_flag, + local_ni, local_i_start, + local_no, local_o_start); + } + if(rnk_n == 2) { + return + PX(local_size_remap_2dto1d_transposed)( + rnk_n, n, howmany, + comm_cart, + transp_flag, trafo_flag, + local_ni, local_i_start, + local_no, local_o_start); + } + abort(); +} + +remap_nd_plan PX(plan_remap_nd_transposed)( + int rnk_n, const INT *n, INT howmany, + MPI_Comm comm_cart, R *in, R *out, + unsigned transp_flag, unsigned trafo_flag, + unsigned opt_flag, unsigned io_flag, unsigned fftw_flags) +{ + if(rnk_n == 3) { + remap_nd_plan ths = remap_nd_mkplan(); + + return + PX(plan_remap_3dto2d_transposed)( + ths, + rnk_n, n, howmany, + comm_cart, in, out, + transp_flag, trafo_flag, + opt_flag, io_flag, fftw_flags); + } + if(rnk_n == 2) { + remap_nd_plan ths = remap_nd_mkplan(); + + return + PX(plan_remap_2dto1d_transposed)( + ths, + rnk_n, n, howmany, + comm_cart, in, out, + transp_flag, trafo_flag, + opt_flag, io_flag, fftw_flags); + } + abort(); +} + +void PX(execute_remap_nd)( + remap_nd_plan ths, R *plannedin, R *plannedout, R *in, R *out) +{ + if(ths==NULL) + return; + +// #if PFFT_DEBUG_GVARS +// int np, myrank; +// int np0, np1, rnk0, rnk1; +// MPI_Comm_size(*gdbg_comm_cart, &np); +// MPI_Comm_rank(*gdbg_comm_cart, &myrank); +// MPI_Comm_size(gdbg_comms_pm[0], &np0); +// MPI_Comm_size(gdbg_comms_pm[1], &np1); +// MPI_Comm_rank(gdbg_comms_pm[0], &rnk0); +// MPI_Comm_rank(gdbg_comms_pm[1], &rnk1); +// +// int dims[3], periods[3], coords[3]; +// MPI_Cart_get(*gdbg_comm_cart, 3, +// dims, periods, coords); +// +// INT local_N[3], local_N_start[3]; +// +// int p0, p1, q0, q1; +// p0 = dims[0]; p1 = dims[1]; +// q0 = np0/p0; q1 = np1/p1; +// +// int lerr, m; +// #endif + + /* execute all initialized plans */ + if(ths->local_transp[0]) + PX(execute_sertrafo)(ths->local_transp[0], plannedin, plannedout, in, out); + +// #if PFFT_DEBUG_GVARS +// local_N[0] = 512/p0; local_N_start[0] = 0; +// local_N[1] = 512/p1; local_N_start[1] = 0; +// local_N[2] = 512/q0/q1; local_N_start[2] = 0; +// +// if(!myrank) fprintf(stderr, "!!! Before 1st remap: check all coefficients !!!\n"); +// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n", +// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]); +// +// lerr=0; +// m=0; +// for(INT k2=local_N_start[2]; k2global_remap[0]->dbg->in[m]; +// if( (data - ind) > 1e-13){ +// if(!lerr) +// if(!myrank) +// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2); +// lerr = 1; +// } +// } +// #endif + + if(ths->global_remap[0]) + PX(execute_gtransp)(ths->global_remap[0], plannedin, plannedout, in, out); + +// #if PFFT_DEBUG_GVARS +// local_N[0] = 512/p0; local_N_start[0] = 0; +// local_N[1] = 512/p1/q1; local_N_start[1] = 0; +// local_N[2] = 512/q0; local_N_start[2] = 0; +// +// if(!myrank) fprintf(stderr, "!!! Before 2nd remap: check all coefficients !!!\n"); +// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n", +// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]); +// +// lerr=0; +// m=0; +// for(INT k2=local_N_start[2]; k2global_remap[1]->dbg->in[m]; +// if( (data - ind) > 1e-13){ +// if(!lerr) +// if(!myrank) +// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2); +// lerr = 1; +// } +// } +// #endif + + if(ths->global_remap[1]) + PX(execute_gtransp)(ths->global_remap[1], plannedin, plannedout, in, out); + +// #if PFFT_DEBUG_GVARS +// local_N[0] = 512/p0/q0; local_N_start[0] = 0; +// local_N[1] = 512/p1/q1; local_N_start[1] = 0; +// local_N[2] = 512; local_N_start[2] = 0; +// +// if(!myrank) fprintf(stderr, "!!! After 2nd remap: check all coefficients !!!\n"); +// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n", +// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]); +// +// lerr=0; +// m=0; +// for(INT k2=local_N_start[2]; k2global_remap[1]->dbg->out[m]; +// if( (data - ind) > 1e-13){ +// if(!lerr) +// if(!myrank) +// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2); +// lerr = 1; +// } +// } +// #endif + + if(ths->local_transp[1]) + PX(execute_sertrafo)(ths->local_transp[1], plannedin, plannedout, in, out); + +} + +static remap_nd_plan remap_nd_mkplan( + void + ) +{ + remap_nd_plan ths = (remap_nd_plan) malloc(sizeof(remap_nd_plan_s)); + + /* initialize to NULL for easy checks */ + for(int t=0; t<2; t++){ + ths->local_transp[t] = NULL; + ths->global_remap[t] = NULL; + } + + return ths; +} + + +void PX(remap_nd_rmplan)( + remap_nd_plan ths + ) +{ + /* plan was already destroyed or never initialized */ + if(ths==NULL) + return; + + for(int t=0; t<2; t++){ + PX(sertrafo_rmplan)(ths->local_transp[t]); + if (ths->global_remap[t]) + PX(gtransp_rmplan)(ths->global_remap[t]); + } + + /* free memory */ + free(ths); + /* ths=NULL; would be senseless, since we can not change the pointer itself */ +} + + +void PX(remap_nd_split_cart_procmesh)( + const INT rnk_n, + MPI_Comm comm_cart, + MPI_Comm * comms_pm) +{ + if(rnk_n == 3) { + PX(split_cart_procmesh_3dto2d_p0q0)(comm_cart, + comms_pm + 0); + PX(split_cart_procmesh_3dto2d_p1q1)(comm_cart, + comms_pm + 1); + return; + } + if(rnk_n == 2) { + PX(split_cart_procmesh_2dto1d_p0q0)(comm_cart, + comms_pm + 0); + return; + } + abort(); +} + +void PX(remap_nd_get_coords)( + const INT rnk_n, + const INT pid, + MPI_Comm comm_cart, + int *np_pm, int *coords_pm) +{ + if(rnk_n == 3) { + int p0, p1, q0, q1, coords_3d[3]; + PX(get_procmesh_dims_2d)(comm_cart, &p0, &p1, &q0, &q1); + MPI_Cart_coords(comm_cart, pid, 3, coords_3d); + PX(coords_3dto2d)(q0, q1, coords_3d, coords_pm); + (np_pm)[0] = p0*q0; (np_pm)[1] = p1*q1; + return; + } + if(rnk_n == 2) { + int p0, q0, coords_2d[3]; + PX(get_procmesh_dims_1d)(comm_cart, &p0, &q0); + MPI_Cart_coords(comm_cart, pid, 2, coords_2d); + PX(coords_2dto1d)(q0, coords_2d, coords_pm); + (np_pm)[0] = p0*q0; + return; + } + abort(); +} + +/* used in API as a short cut to pin the block size; notice that only iblk is needed. */ +void PX(remap_nd_calculate_blocks)( + const INT rnk_n, + const INT *n, MPI_Comm comm_cart, + INT *iblk + ) +{ + if(rnk_n == 3) { + int q0, q1, p0, p1; + INT mblk[3], oblk[3]; + + PX(get_procmesh_dims_2d)(comm_cart, &p0, &p1, &q0, &q1); + PX(default_block_size_3dto2d)(n, p0, p1, q0, q1, + iblk, mblk, oblk); + return; + } + if(rnk_n == 3) { + int q0, p0; + INT oblk[3]; + + PX(get_procmesh_dims_1d)(comm_cart, &p0, &q0); + PX(default_block_size_2dto1d)(n, p0, q0, + iblk, oblk); + return; + } + abort(); +} + diff --git a/kernel/remap_2dto1d.c b/kernel/remap_2dto1d.c new file mode 100644 index 0000000..19f507e --- /dev/null +++ b/kernel/remap_2dto1d.c @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2011-2013 Michael Pippig + * + * This file is part of PFFT. + * + * PFFT is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * PFFT is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with PFFT. If not, see . + * + */ + + +#include "pfft.h" +#include "ipfft.h" +#include "util.h" + +/* Global infos about procmesh are only enabled in debug mode + * Otherwise we do not use any global variables. */ +#if PFFT_DEBUG_GVARS + extern MPI_Comm *gdbg_comm_cart; + extern int gdbg_rnk_pm; + extern MPI_Comm *gdbg_comms_pm; +#endif + +/* TODO: think about order of in and out for T_IN */ +/* TODO: implement user blocksize */ + +static void init_blks_comms_local_size( + const INT *n, MPI_Comm comm_cart_2d, + INT *iblk, INT *oblk, + MPI_Comm *icomms, MPI_Comm *ocomms, + INT *local_ni, INT *local_no); + +static void split_comms_2dto1d( + MPI_Comm comm_cart_2d, + MPI_Comm *icomms, MPI_Comm *ocomms); +static void get_local_blocks_by_comms( + const INT *n, + const INT *iblks, const MPI_Comm *icomms, + const INT *oblks, const MPI_Comm *ocomms, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start); +static void get_local_n_2d_by_comms( + const INT *n, const INT *blks, const MPI_Comm *comms, + INT *local_n); +static void get_local_start_2d_by_comms( + const INT *n, const INT *blks, const MPI_Comm *comms, + INT *local_start); +static void get_local_blocks_by_coords( + const INT *n, + const INT *iblks, const int *icoords, + const INT *oblks, const int *ocoords, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start); +static void get_local_n_2d_by_coords( + const INT *n, const INT *blks, const int *coords, + INT *local_n); +static void get_local_start_2d_by_coords( + const INT *n, const INT *blks, const int *coords, + INT *local_start); + +void PX(local_block_remap_2dto1d_transposed)( + int rnk_n, const INT *pn, + MPI_Comm comm_cart_2d, int pid, + unsigned transp_flag, unsigned trafo_flag, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start + ) +{ + int p0, q0, rnk_pm; + int icoords[2], ocoords[2]; + INT iblks[2], oblks[2]; + + /* remap only works for 3d data on 3d procmesh */ + if(rnk_n != 2) return; + + MPI_Cartdim_get(comm_cart_2d, &rnk_pm); + if(rnk_pm != 2) return; + + /* Handle r2c input and c2r output like r2r. For complex data we use the C2C flag. */ + if(trafo_flag & PFFTI_TRAFO_RDFT) + trafo_flag = PFFTI_TRAFO_R2R; + + PX(get_procmesh_dims_1d)(comm_cart_2d, &p0, &q0); + PX(default_block_size_2dto1d)(pn, p0, q0, iblks, oblks); + + MPI_Cart_coords(comm_cart_2d, pid, 2, icoords); + PX(coords_2dto1d)(q0, icoords, ocoords); + ocoords[1] = 0; + + /* take care of transposed data ordering */ + if(transp_flag & PFFT_TRANSPOSED_OUT){ + get_local_blocks_by_coords(pn, iblks, icoords, oblks, ocoords, + local_ni, local_i_start, local_no, local_o_start); + } else { + get_local_blocks_by_coords(pn, iblks, icoords, oblks, ocoords, + local_no, local_o_start, local_ni, local_i_start); + } +} + +static void free_two_comms(MPI_Comm *comms) +{ + const int num_comms = 2; + for(int t=0; t n1/p1 x n0/p0 */ + nb = local_ni[0]; + nt = local_ni[1]; + + mem_tmp = PX(local_size_sertrafo)( + nb, 1, &nt, howmany, trafo_flag); + mem = MAX(mem, mem_tmp); + + /* N1/P x h1 x N0 x h0 -> N1 x h1 x N0/P x h0 + * with P = q0, N1 = n1, h1 = 1, N0 = n0/p0, h0 = 1 + * */ + /* n1/q0 x n0/p0 -> n1 x n0/(p0*q0) + * for each q0 ranks, a transpose of a matrix n1 x n0/p0, + * from divided along n1, to along n0/p0 + * */ + N0 = local_ni[0]; h0 = 1; /* n0 / p0 */ + N1 = local_no[1]; h1 = 1; /* n1 */ + blk0 = oblk[0]; /* n0/(p0*q0) */ + blk1 = iblk[1]; /* n1 / q0 */ + hm = 1; /* set hm to 1 since mem will be in units of real/complex */ + + PX(split_cart_procmesh_for_2dto1d_remap_q0)(comm_cart_2d, &comm_q0); + mem_tmp = PX(local_size_global_transp)( + N0, N1, h0, h1, hm, blk0, blk1, comm_q0); + mem = MAX(mem, mem_tmp); + MPI_Comm_free(&comm_q0); + + /* n1 x n0/(p0*q0) -> n0/(p0*q0) x n1 */ + nb = local_no[1]; + nt = local_no[0]; + + mem_tmp = PX(local_size_sertrafo)( + nb, 1, &nt, howmany, trafo_flag); + mem = MAX(mem, mem_tmp); + + /* take care of transposed data ordering */ + if(transp_flag & PFFT_TRANSPOSED_OUT){ + get_local_blocks_by_comms(pn, iblk, icomms, oblk, ocomms, + local_ni, local_i_start, local_no, local_o_start); + } else { + get_local_blocks_by_comms(pn, iblk, icomms, oblk, ocomms, + local_no, local_o_start, local_ni, local_i_start); + } + + /* free communicators */ + free_two_comms(icomms); + free_two_comms(ocomms); + + return mem; +} + + +/* ouput is written to 'in', also for outofplace */ +remap_nd_plan PX(plan_remap_2dto1d_transposed)( + remap_nd_plan ths, + int rnk_n, const INT *pn, INT howmany, + MPI_Comm comm_cart_2d, R *in_user, R *out_user, + unsigned transp_flag, unsigned trafo_flag, + unsigned opt_flag, unsigned io_flag, unsigned fftw_flags + ) +{ + int p0, p1, rnk_pm; + INT nb, nt, N0, N1, h0, h1, hm, blk0, blk1; + INT local_ni[2], local_no[2]; + INT iblk[2],oblk[2]; + MPI_Comm icomms[2], ocomms[2]; + MPI_Comm comm_q0; + R *in=in_user, *out=out_user; + + /* remap only works for 2d data on 2d procmesh */ + if(rnk_n != 2) + return NULL; + + MPI_Cartdim_get(comm_cart_2d, &rnk_pm); + if(rnk_pm != 2) + return NULL; + + /* Handle r2c input and c2r output like r2r. For complex data we use the C2C flag. */ + if(trafo_flag & PFFTI_TRAFO_RDFT) + trafo_flag = PFFTI_TRAFO_R2R; + + PX(get_procmesh_dims_1d)(comm_cart_2d, &p0, &ths->q0); + init_blks_comms_local_size(pn, comm_cart_2d, + iblk, oblk, icomms, ocomms, + local_ni, local_no); + + ths->local_transp[0] = NULL; + ths->local_transp[1] = NULL; + ths->global_remap[0] = NULL; + ths->global_remap[1] = NULL; + + /* plan TRANSPOSED_IN in opposite direction */ + + /* p1 == q0 */ + + /* n0/p0 x n1/p1 -> n1/q0 x n0/p0 */ + + /* n1/q0 x n0/p0 -> n1 x n0/(p0*q0) */ + + + N0 = local_ni[0]; h0 = 1; /* n0 / p0 */ + N1 = local_no[1]; h1 = 1; /* n1 */ + blk0 = oblk[0]; /* n0/(p0*q0) */ + blk1 = iblk[1]; /* n1 / q0 */ + hm = howmany * (trafo_flag & PFFTI_TRAFO_C2C ? 2 : 1); + + PX(split_cart_procmesh_for_2dto1d_remap_q0)(comm_cart_2d, &comm_q0); + + /* The break up into transformations here is likely suboptimal + * but at least it gives correct result. + * + * The tricky fact is that depending on DESTROY_INPUT the result + * of this routine shall be either in 'in' or in 'out'. + * */ + if(transp_flag & PFFT_TRANSPOSED_IN) { + R * tmp; + + if(~io_flag & PFFT_DESTROY_INPUT) { + /* preserve input means the result is written in the output */ + nb = local_no[1]; + nt = local_no[0]; + + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); + + nb = local_ni[0]; + nt = local_ni[1]; + + ths->global_remap[0] = PX(plan_global_transp)( + N1, N0, h1, h0, hm, blk1, blk0, + comm_q0, out, out, PFFT_TRANSPOSED_OUT, fftw_flags); + + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, out, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } else { + /* destroy input means the result is written in the input */ + nb = local_no[1]; + nt = local_no[0]; + + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); + + nb = local_ni[0]; + nt = local_ni[1]; + + /* keep the global out of place to make it 'faster'? */ + ths->global_remap[0] = PX(plan_global_transp)( + N1, N0, h1, h0, hm, blk1, blk0, + comm_q0, out, in, PFFT_TRANSPOSED_OUT, fftw_flags); + + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, in, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } + + } else { + if(~io_flag & PFFT_DESTROY_INPUT) { + /* preserve input means the result is written in the output */ + nb = local_ni[0]; + nt = local_ni[1]; + + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags | FFTW_PRESERVE_INPUT); + + ths->global_remap[1] = PX(plan_global_transp)( + N0, N1, h0, h1, hm, blk0, blk1, + comm_q0, out, out, PFFT_TRANSPOSED_IN, fftw_flags); + + nb = local_no[1]; + nt = local_no[0]; + + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, out, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } else { + /* destroy input means the result is written in the input */ + nb = local_ni[0]; + nt = local_ni[1]; + + ths->local_transp[0] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, out, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + + ths->global_remap[1] = PX(plan_global_transp)( + N0, N1, h0, h1, hm, blk0, blk1, + comm_q0, out, in, PFFT_TRANSPOSED_IN, fftw_flags); + + nb = local_no[1]; + nt = local_no[0]; + ths->local_transp[1] = PX(plan_sertrafo)( + nb, 1, &nt, howmany, in, in, 0, NULL, + trafo_flag| PFFTI_TRAFO_SKIP, transp_flag, 0, + opt_flag, fftw_flags); + } + } + MPI_Comm_free(&comm_q0); + + /* free communicators */ + free_two_comms(icomms); + free_two_comms(ocomms); + + return ths; +} + +static void init_blks_comms_local_size( + const INT *n, MPI_Comm comm_cart_2d, + INT *iblk, INT *oblk, + MPI_Comm *icomms, MPI_Comm *ocomms, + INT *local_ni, INT *local_no + ) +{ + int p0, q0; + + PX(get_procmesh_dims_1d)(comm_cart_2d, &p0, &q0); + + PX(default_block_size_2dto1d)(n, p0, q0, + iblk, oblk); + split_comms_2dto1d(comm_cart_2d, + icomms, ocomms); + get_local_n_2d_by_comms(n, iblk, icomms, + local_ni); + get_local_n_2d_by_comms(n, oblk, ocomms, + local_no); +} + +static void get_local_blocks_by_comms( + const INT *n, + const INT *iblks, const MPI_Comm *icomms, + const INT *oblks, const MPI_Comm *ocomms, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start + ) +{ + get_local_n_2d_by_comms(n, iblks, icomms, local_ni); + get_local_start_2d_by_comms(n, iblks, icomms, local_i_start); + get_local_n_2d_by_comms(n, oblks, ocomms, local_no); + get_local_start_2d_by_comms(n, oblks, ocomms, local_o_start); +} + +static void get_local_n_2d_by_comms( + const INT *n, const INT *blks, const MPI_Comm *comms, + INT *local_n + ) +{ + int coord; + + for(int t=0; t<2; t++){ + PX(get_mpi_cart_coord_1d)(comms[t], &coord); + local_n[t] = PX(local_block_size)(n[t], blks[t], coord); + } +} + +static void get_local_start_2d_by_comms( + const INT *n, const INT *blks, const MPI_Comm *comms, + INT *local_start + ) +{ + int coord; + + for(int t=0; t<2; t++){ + PX(get_mpi_cart_coord_1d)(comms[t], &coord); + local_start[t] = PX(local_block_offset)(n[t], blks[t], coord); + } +} + +static void get_local_blocks_by_coords( + const INT *n, + const INT *iblks, const int *icoords, + const INT *oblks, const int *ocoords, + INT *local_ni, INT *local_i_start, + INT *local_no, INT *local_o_start + ) +{ + get_local_n_2d_by_coords(n, iblks, icoords, local_ni); + get_local_start_2d_by_coords(n, iblks, icoords, local_i_start); + get_local_n_2d_by_coords(n, oblks, ocoords, local_no); + get_local_start_2d_by_coords(n, oblks, ocoords, local_o_start); +} + +static void get_local_n_2d_by_coords( + const INT *n, const INT *blks, const int *coords, + INT *local_n + ) +{ + for(int t=0; t<2; t++) + local_n[t] = PX(local_block_size)(n[t], blks[t], coords[t]); +} + +static void get_local_start_2d_by_coords( + const INT *n, const INT *blks, const int *coords, + INT *local_start + ) +{ + for(int t=0; t<2; t++) + local_start[t] = PX(local_block_offset)(n[t], blks[t], coords[t]); +} + + + +void PX(default_block_size_2dto1d)( + const INT *n, int p0, int q0, + INT *iblk, INT *oblk + ) +{ + /* n0/(p0*q0) x n1 */ + oblk[0] = PX(global_block_size)(n[0], PFFT_DEFAULT_BLOCK, p0*q0); + oblk[1] = n[1]; + + /* n0/p0 x n1/q0 */ + iblk[0] = oblk[0]*q0; + iblk[1] = PX(global_block_size)(n[1], PFFT_DEFAULT_BLOCK, q0); +} + + +/* allocate array of length 2 for communicators */ +static void split_comms_2dto1d( + MPI_Comm comm_cart_2d, + MPI_Comm *icomms, MPI_Comm *ocomms + ) +{ + int ndims=1, dim_1d, period_1d, reorder=0; + + /* n0/p0 x n1/p1 */ + PX(split_cart_procmesh)(comm_cart_2d, icomms); + + /* n0/(p0 * q0) x n1 */ + PX(split_cart_procmesh_2dto1d_p0q0)(comm_cart_2d, &ocomms[0]); + dim_1d=1; period_1d=1; + MPI_Cart_create(MPI_COMM_SELF, ndims, &dim_1d, &period_1d, reorder, + &ocomms[1]); + +} + +void PX(coords_2dto1d)( + int q0, const int *coords_2d, + int *coords_1d + ) +{ + coords_1d[0] = coords_2d[0]*q0 + coords_2d[1]; +} + +void PX(split_cart_procmesh_2dto1d_p0q0)( + MPI_Comm comm_cart_2d, + MPI_Comm *comm_1d + ) +{ + int p0, q0=0; + int ndims, coords_2d[2]; + int dim_1d, period_1d, reorder=0; + int color, key; + MPI_Comm comm; + + if( !PX(is_cart_procmesh)(comm_cart_2d) ) + return; + + MPI_Cartdim_get(comm_cart_2d, &ndims); + if(ndims != 2) + return; + + PX(get_mpi_cart_coords)(comm_cart_2d, ndims, coords_2d); + PX(get_procmesh_dims_1d)(comm_cart_2d, &p0, &q0); + + /* split into 1 comms of size p0*q0 */ + key = coords_2d[0]*q0 + coords_2d[1]; + color = 0; + MPI_Comm_split(comm_cart_2d, color, key, &comm); + + dim_1d = p0*q0; period_1d = 1; + MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, + comm_1d); + + MPI_Comm_free(&comm); +} + + +/* implement the splitting to create p0 comms of size q0 */ +void PX(split_cart_procmesh_for_2dto1d_remap_q0)( + MPI_Comm comm_cart_2d, + MPI_Comm *comm_1d + ) +{ + int p0, q0=0; + int ndims, coords_2d[2]; + int dim_1d, period_1d, reorder=0; + int color, key; + MPI_Comm comm; + + if( !PX(is_cart_procmesh)(comm_cart_2d) ) + return; + + MPI_Cartdim_get(comm_cart_2d, &ndims); + if(ndims != 2) + return; + + PX(get_mpi_cart_coords)(comm_cart_2d, ndims, coords_2d); + PX(get_procmesh_dims_1d)(comm_cart_2d, &p0, &q0); + + /* split into p0 comms of size q0 */ + color = coords_2d[0]; + key = coords_2d[1]; + MPI_Comm_split(comm_cart_2d, color, key, &comm); + + dim_1d = q0; period_1d = 1; + MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, + comm_1d); + + MPI_Comm_free(&comm); +} + +void PX(get_procmesh_dims_1d)( + MPI_Comm comm_cart_2d, + int *p0, int *q0 + ) +{ + int ndims=2, dims[2]; + + PX(get_mpi_cart_dims)(comm_cart_2d, ndims, dims); + *p0 = dims[0]; + *q0 = dims[1]; +} + + + + + + diff --git a/kernel/remap_3dto2d.c b/kernel/remap_3dto2d.c index c6607e4..1f47794 100644 --- a/kernel/remap_3dto2d.c +++ b/kernel/remap_3dto2d.c @@ -23,14 +23,31 @@ #include "ipfft.h" #include "util.h" +/* + * useful table of i, o, and m. p = q0*q1 + * + * i : n0 / p0 x n1 / p1 x n2 / p2 + * m : n0 / p0 x n1 / (p1 * q1) x n2 / q0 + * o : n0 / (p0 * q0) x n1 / (p1 * q1) x n2 / 1 + * + * */ + /* Global infos about procmesh are only enabled in debug mode * Otherwise we do not use any global variables. */ + #if PFFT_DEBUG_GVARS extern MPI_Comm *gdbg_comm_cart; extern int gdbg_rnk_pm; extern MPI_Comm *gdbg_comms_pm; #endif +static void factorize( + int q, + int *ptr_q0, int *ptr_q1); +static void factorize_equal( + int p0, int p1, int q, + int *ptr_q0, int *ptr_q1); + /* TODO: think about order of in and out for T_IN */ /* TODO: implement user blocksize */ @@ -68,10 +85,6 @@ static void get_local_start_3d_by_coords( const INT *n, const INT *blks, const int *coords, INT *local_start); -static remap_3dto2d_plan remap_3dto2d_mkplan( - void); - - void PX(local_block_remap_3dto2d_transposed)( int rnk_n, const INT *pn, MPI_Comm comm_cart_3d, int pid, @@ -216,7 +229,8 @@ int PX(local_size_remap_3dto2d_transposed)( /* ouput is written to 'in', also for outofplace */ -remap_3dto2d_plan PX(plan_remap_3dto2d_transposed)( +remap_nd_plan PX(plan_remap_3dto2d_transposed)( + remap_nd_plan ths, int rnk_n, const INT *pn, INT howmany, MPI_Comm comm_cart_3d, R *in_user, R *out_user, unsigned transp_flag, unsigned trafo_flag, @@ -230,7 +244,6 @@ remap_3dto2d_plan PX(plan_remap_3dto2d_transposed)( MPI_Comm icomms[3], mcomms[3], ocomms[3]; MPI_Comm comm_q0, comm_q1; R *in=in_user, *out=out_user; - remap_3dto2d_plan ths; /* remap only works for 3d data on 3d procmesh */ if(rnk_n != 3) @@ -240,8 +253,6 @@ remap_3dto2d_plan PX(plan_remap_3dto2d_transposed)( if(rnk_pm != 3) return NULL; - ths = remap_3dto2d_mkplan(); - /* Handle r2c input and c2r output like r2r. For complex data we use the C2C flag. */ if(trafo_flag & PFFTI_TRAFO_RDFT) trafo_flag = PFFTI_TRAFO_R2R; @@ -273,6 +284,15 @@ remap_3dto2d_plan PX(plan_remap_3dto2d_transposed)( } /* n2/(q0*q1) x n0/p0 x n1/p1 -> n2/q0 x n0/p0 x n1/(p1*q1) */ + /* for each q1 ranks, we are looking at a transpose of + * local_ni[1] x (local_nm[2] x local_ni[0]), + * The intial partition is along (local_nm[2] x local_ni[0]), + * by size iblk[2] x local_ni[0]. + * The final partition is along local_ni[1], by size + * mblk[1]. + * + * The math works out by referring to the table at the beginning of the code. + * */ N0 = local_ni[1]; h0 = 1; N1 = local_nm[2]; h1 = local_ni[0]; blk0 = mblk[1]; @@ -481,165 +501,206 @@ static void split_comms_3dto2d( PX(split_cart_procmesh_for_3dto2d_remap_q0)(comm_cart_3d, &mcomms[2]); } +void PX(coords_3dto2d)( + int q0, int q1, const int *coords_3d, + int *coords_2d + ) +{ + coords_2d[0] = coords_3d[0]*q0 + coords_3d[2]/q1; + coords_2d[1] = coords_3d[1]*q1 + coords_3d[2]%q1; +} +void PX(split_cart_procmesh_3dto2d_p0q0)( + MPI_Comm comm_cart_3d, + MPI_Comm *comm_1d + ) +{ + int p0, p1, q0=0, q1=0; + int ndims, coords_3d[3]; + int dim_1d, period_1d, reorder=0; + int color, key; + MPI_Comm comm; + if( !PX(is_cart_procmesh)(comm_cart_3d) ) + return; + MPI_Cartdim_get(comm_cart_3d, &ndims); + if(ndims != 3) + return; + PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); + PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); + /* split into p1*q1 comms of size p0*q0 */ + key = coords_3d[0]*q0 + coords_3d[2]/q1; + color = coords_3d[1]*q1 + coords_3d[2]%q1; + MPI_Comm_split(comm_cart_3d, color, key, &comm); + dim_1d = p0*q0; period_1d = 1; + MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, + comm_1d); + MPI_Comm_free(&comm); +} -void PX(execute_remap_3dto2d)( - remap_3dto2d_plan ths, R * plannedin, R * plannedout, R * in, R * out +void PX(split_cart_procmesh_3dto2d_p1q1)( + MPI_Comm comm_cart_3d, + MPI_Comm *comm_1d ) { - if(ths==NULL) + int p0, p1, q0=0, q1=0; + int ndims, coords_3d[3]; + int dim_1d, period_1d, reorder=0; + int color, key; + MPI_Comm comm; + + if( !PX(is_cart_procmesh)(comm_cart_3d) ) return; -// #if PFFT_DEBUG_GVARS -// int np, myrank; -// int np0, np1, rnk0, rnk1; -// MPI_Comm_size(*gdbg_comm_cart, &np); -// MPI_Comm_rank(*gdbg_comm_cart, &myrank); -// MPI_Comm_size(gdbg_comms_pm[0], &np0); -// MPI_Comm_size(gdbg_comms_pm[1], &np1); -// MPI_Comm_rank(gdbg_comms_pm[0], &rnk0); -// MPI_Comm_rank(gdbg_comms_pm[1], &rnk1); -// -// int dims[3], periods[3], coords[3]; -// MPI_Cart_get(*gdbg_comm_cart, 3, -// dims, periods, coords); -// -// INT local_N[3], local_N_start[3]; -// -// int p0, p1, q0, q1; -// p0 = dims[0]; p1 = dims[1]; -// q0 = np0/p0; q1 = np1/p1; -// -// int lerr, m; -// #endif - - /* execute all initialized plans */ - PX(execute_sertrafo)(ths->local_transp[0], plannedin, plannedout, in, out); - -// #if PFFT_DEBUG_GVARS -// local_N[0] = 512/p0; local_N_start[0] = 0; -// local_N[1] = 512/p1; local_N_start[1] = 0; -// local_N[2] = 512/q0/q1; local_N_start[2] = 0; -// -// if(!myrank) fprintf(stderr, "!!! Before 1st remap: check all coefficients !!!\n"); -// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n", -// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]); -// -// lerr=0; -// m=0; -// for(INT k2=local_N_start[2]; k2global_remap[0]->dbg->in[m]; -// if( (data - ind) > 1e-13){ -// if(!lerr) -// if(!myrank) -// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2); -// lerr = 1; -// } -// } -// #endif - - PX(execute_gtransp)(ths->global_remap[0], plannedin, plannedout, in, out); - -// #if PFFT_DEBUG_GVARS -// local_N[0] = 512/p0; local_N_start[0] = 0; -// local_N[1] = 512/p1/q1; local_N_start[1] = 0; -// local_N[2] = 512/q0; local_N_start[2] = 0; -// -// if(!myrank) fprintf(stderr, "!!! Before 2nd remap: check all coefficients !!!\n"); -// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n", -// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]); -// -// lerr=0; -// m=0; -// for(INT k2=local_N_start[2]; k2global_remap[1]->dbg->in[m]; -// if( (data - ind) > 1e-13){ -// if(!lerr) -// if(!myrank) -// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2); -// lerr = 1; -// } -// } -// #endif - - PX(execute_gtransp)(ths->global_remap[1], plannedin, plannedout, in, out); - -// #if PFFT_DEBUG_GVARS -// local_N[0] = 512/p0/q0; local_N_start[0] = 0; -// local_N[1] = 512/p1/q1; local_N_start[1] = 0; -// local_N[2] = 512; local_N_start[2] = 0; -// -// if(!myrank) fprintf(stderr, "!!! After 2nd remap: check all coefficients !!!\n"); -// if(!myrank) fprintf(stderr, "!!! local_N=[%td, %td, %td], local_N_start = [%td, %td, %td]\n", -// local_N[0], local_N[1], local_N[2], local_N_start[0], local_N_start[1], local_N_start[2]); -// -// lerr=0; -// m=0; -// for(INT k2=local_N_start[2]; k2global_remap[1]->dbg->out[m]; -// if( (data - ind) > 1e-13){ -// if(!lerr) -// if(!myrank) -// fprintf(stderr, "data[%td] = %e, ind = %e, k0=%td, k1=%td, k2=%td\n", data, m, ind, k0, k1, k2); -// lerr = 1; -// } -// } -// #endif - - PX(execute_sertrafo)(ths->local_transp[1], plannedin, plannedout, in, out); + MPI_Cartdim_get(comm_cart_3d, &ndims); + if(ndims != 3) + return; + + PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); + PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); + + /* split into p0*q0 comms of size p1*q1 */ + color = coords_3d[0]*q0 + coords_3d[2]/q1; + key = coords_3d[1]*q1 + coords_3d[2]%q1; + MPI_Comm_split(comm_cart_3d, color, key, &comm); + + dim_1d = p1*q1; period_1d = 1; + MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, + comm_1d); + + MPI_Comm_free(&comm); } -static remap_3dto2d_plan remap_3dto2d_mkplan( - void + +/* implement the splitting to create p0*p1*q0 comms of size q1 + * and p0*p1*q1 comms of size q0 */ +void PX(split_cart_procmesh_for_3dto2d_remap_q0)( + MPI_Comm comm_cart_3d, + MPI_Comm *comm_1d ) { - remap_3dto2d_plan ths = (remap_3dto2d_plan) malloc(sizeof(remap_3dto2d_plan_s)); + int p0, p1, q0=0, q1=0; + int ndims, coords_3d[3]; + int dim_1d, period_1d, reorder=0; + int color, key; + MPI_Comm comm; - /* initialize to NULL for easy checks */ - for(int t=0; t<2; t++){ - ths->local_transp[t] = NULL; - ths->global_remap[t] = NULL; - } - - return ths; + if( !PX(is_cart_procmesh)(comm_cart_3d) ) + return; + + MPI_Cartdim_get(comm_cart_3d, &ndims); + if(ndims != 3) + return; + + PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); + PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); + + /* split into p0*p1*q1 comms of size q0 */ + color = coords_3d[0]*p1*q1 + coords_3d[1]*q1 + coords_3d[2]%q1; + key = coords_3d[2]/q1; + MPI_Comm_split(comm_cart_3d, color, key, &comm); + + dim_1d = q0; period_1d = 1; + MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, + comm_1d); + + MPI_Comm_free(&comm); } -void PX(remap_3dto2d_rmplan)( - remap_3dto2d_plan ths +void PX(split_cart_procmesh_for_3dto2d_remap_q1)( + MPI_Comm comm_cart_3d, + MPI_Comm *comm_1d ) { - /* plan was already destroyed or never initialized */ - if(ths==NULL) + int p0, p1, q0=0, q1=0; + int ndims, coords_3d[3]; + int dim_1d, period_1d, reorder=0; + int color, key; + MPI_Comm comm; + + if( !PX(is_cart_procmesh)(comm_cart_3d) ) + return; + + MPI_Cartdim_get(comm_cart_3d, &ndims); + if(ndims != 3) return; - for(int t=0; t<2; t++){ - PX(sertrafo_rmplan)(ths->local_transp[t]); - PX(gtransp_rmplan)(ths->global_remap[t]); + PX(get_mpi_cart_coords)(comm_cart_3d, ndims, coords_3d); + PX(get_procmesh_dims_2d)(comm_cart_3d, &p0, &p1, &q0, &q1); + + /* split into p0*p1*q0 comms of size q1 */ + color = coords_3d[0]*p1*q0 + coords_3d[1]*q0 + coords_3d[2]/q1; + key = coords_3d[2]%q1; +// key = coords_3d[2]/q0; /* TODO: delete this line after several tests */ + MPI_Comm_split(comm_cart_3d, color, key, &comm); + + dim_1d = q1; period_1d = 1; + MPI_Cart_create(comm, ndims=1, &dim_1d, &period_1d, reorder, + comm_1d); + + MPI_Comm_free(&comm); +} + +void PX(get_procmesh_dims_2d)( + MPI_Comm comm_cart_3d, + int *p0, int *p1, int *q0, int *q1 + ) +{ + int ndims=3, dims[3]; + + PX(get_mpi_cart_dims)(comm_cart_3d, ndims, dims); + *p0 = dims[0]; *p1 = dims[1]; +// factorize(dims[2], q0, q1); + factorize_equal(dims[0], dims[1], dims[2], q0, q1); +} + +/* factorize an integer q into q0*q1 with + * q1 <= q0 and q0-q1 -> min */ +static void factorize( + int q, + int *ptr_q0, int *ptr_q1 + ) +{ + for(int t = 1; t <= sqrt(q); t++) + if(t * (q/t) == q) + *ptr_q1 = t; + + *ptr_q0 = q / (*ptr_q1); +} + +/* factorize an integer q into q0*q1 with + * abs(p0*q0 - p1*q1) -> min */ +static void factorize_equal( + int p0, int p1, int q, + int *ptr_q0, int *ptr_q1 + ) +{ + int q0, q1; + int opt_q0 = 1; + int opt_q1 = q; + R min_err = pfft_fabs(p0 * q - p1 * 1.0); + + for(q1 = 1; q1 <= sqrt(q); q1++){ + q0 = q/q1; + if(q0*q1 == q){ + R err = pfft_fabs(p0*q0 - p1*q1); + if(err < min_err){ + min_err = err; + opt_q0 = q0; + opt_q1 = q1; + } + } } - /* free memory */ - free(ths); - /* ths=NULL; would be senseless, since we can not change the pointer itself */ + *ptr_q0 = opt_q0; + *ptr_q1 = opt_q1; } diff --git a/kernel/sertrafo.c b/kernel/sertrafo.c index 4dadde8..1f7b616 100644 --- a/kernel/sertrafo.c +++ b/kernel/sertrafo.c @@ -313,7 +313,7 @@ static sertrafo_plan plan_remap_only( plan_remap(&ths->plan[0], nb, rnk, n, howmany, in, out, trafo_flag, transp_flag, fftw_flags, PFFTI_ARRAY_INPUT); #endif - ths->plan[1].plan = NULL; + ths->plan[1].fftw = NULL; return ths; } @@ -443,7 +443,7 @@ static sertrafo_plan plan_sertrafo_pt( /* only measure times if we need them for comparison to plan1 */ if(opt_flag & PFFT_TUNE) - time0 = measure_time(ths0->plan[0].plan) + measure_time(ths0->plan[1].plan); + time0 = measure_time(ths0->plan[0].fftw) + measure_time(ths0->plan[1].fftw); /* this plan differs for altered strides or out-of-place */ ths1 = sertrafo_mkplan(); @@ -470,7 +470,7 @@ static sertrafo_plan plan_sertrafo_pt( nb, rnk, n, howmany, in1, out1, sign, kinds, trafo_flag, transp_flag1, fftw_flags PFFT_DEBUG_SERTRAFO_PTR11); } - time1 = measure_time(ths1->plan[0].plan) + measure_time(ths1->plan[1].plan); + time1 = measure_time(ths1->plan[0].fftw) + measure_time(ths1->plan[1].fftw); } } @@ -501,12 +501,12 @@ static void plan_trafo( /* R2C can not combine trafo and transposition */ if( (trafo_flag & PFFTI_TRAFO_RDFT) && needs_transpose(transp_flag) ) { - plan->plan = NULL; + plan->fftw = NULL; return; } /* R2R can not combine trafo and transposition */ if( (trafo_flag & PFFTI_TRAFO_R2R) && needs_transpose(transp_flag) ) { - plan->plan = NULL; + plan->fftw = NULL; return; } malloc_and_fill_dims_trafo( @@ -515,32 +515,32 @@ static void plan_trafo( /* choose appropriate fftw planner for trafo */ if(trafo_flag & PFFTI_TRAFO_R2C){ - plan->plan = X(plan_guru64_dft_r2c)( + plan->fftw = X(plan_guru64_dft_r2c)( dims_rnk, dims, howmany_rnk, howmany_dims, in, (C*) out, fftw_flags); - plan->planned_in = in; - plan->planned_out = out; + plan->in = in; + plan->out = out; plan->execute = (PX(fftw_execute)) X(execute_dft_r2c); } else if(trafo_flag & PFFTI_TRAFO_C2R){ - plan->plan = X(plan_guru64_dft_c2r)( + plan->fftw = X(plan_guru64_dft_c2r)( dims_rnk, dims, howmany_rnk, howmany_dims, (C*) in, out, fftw_flags); - plan->planned_in = in; - plan->planned_out = out; + plan->in = in; + plan->out = out; plan->execute = (PX(fftw_execute)) X(execute_dft_c2r); } else if(trafo_flag & PFFTI_TRAFO_R2R){ - plan->plan = X(plan_guru64_r2r)( + plan->fftw = X(plan_guru64_r2r)( dims_rnk, dims, howmany_rnk, howmany_dims, in, out, kinds, fftw_flags); - plan->planned_in = in; - plan->planned_out = out; + plan->in = in; + plan->out = out; plan->execute = (PX(fftw_execute)) X(execute_r2r); } else { - plan->plan = X(plan_guru64_dft)( + plan->fftw = X(plan_guru64_dft)( dims_rnk, dims, howmany_rnk, howmany_dims, (C*) in, (C*) out, sign, fftw_flags); - plan->planned_in = in; - plan->planned_out = out; + plan->in = in; + plan->out = out; plan->execute = (PX(fftw_execute)) X(execute_dft); } @@ -622,18 +622,18 @@ static void plan_remap( /* choose appropriate fftw planner for remap */ if(trafo_flag & PFFTI_TRAFO_C2C) { - plan->plan = X(plan_guru64_dft)( + plan->fftw = X(plan_guru64_dft)( dims_rnk, dims, howmany_rnk, howmany_dims, (C*) in, (C*) out, sign, fftw_flags); - plan->planned_in = in; - plan->planned_out = out; + plan->in = in; + plan->out = out; plan->execute = (PX(fftw_execute)) X(execute_dft); } else { - plan->plan = X(plan_guru64_r2r)( + plan->fftw = X(plan_guru64_r2r)( dims_rnk, dims, howmany_rnk, howmany_dims, in, out, kinds, fftw_flags); - plan->planned_in = in; - plan->planned_out = out; + plan->in = in; + plan->out = out; plan->execute = (PX(fftw_execute)) X(execute_r2r); } #if PFFT_DEBUG_SERTRAFO @@ -857,7 +857,7 @@ static sertrafo_plan sertrafo_mkplan( /* initialize to NULL for easy checks */ for(int t=0; t<2; t++) - ths->plan[t].plan = NULL; + ths->plan[t].fftw= NULL; /* initialize debug info */ #if PFFT_DEBUG_SERTRAFO @@ -878,8 +878,8 @@ void PX(sertrafo_rmplan)( /* take care of unsuccessful FFTW planing */ for(int t=0; t<2; t++) - if(ths->plan[t].plan != NULL) - X(destroy_plan)(ths->plan[t].plan); + if(ths->plan[t].fftw != NULL) + X(destroy_plan)(ths->plan[t].fftw); #if PFFT_DEBUG_SERTRAFO for(int t=0; t<2; t++) @@ -1064,10 +1064,17 @@ void PX(execute_fftw_plan)( R *executed_in, R *executed_out) { - R *in = (fftwplan->planned_in == planned_in) ? executed_in : executed_out; - R *out = (fftwplan->planned_out == planned_out) ? executed_out : executed_in; + R *in = NULL; // = (fftwplan->planned_in == planned_in) ? executed_in : executed_out; + R *out = NULL; //= (fftwplan->planned_out == planned_out) ? executed_out : executed_in; - (fftwplan->execute)(fftwplan->plan, in, out); + if(fftwplan->in == planned_in) in = executed_in; + if(fftwplan->in == planned_out) in = executed_out; + if(fftwplan->out == planned_in) out = executed_in; + if(fftwplan->out == planned_out) out = executed_out; + + if(in == NULL) abort(); + if(out == NULL) abort(); + (fftwplan->execute)(fftwplan->fftw, in, out); } void PX(execute_sertrafo)( @@ -1090,7 +1097,7 @@ void PX(execute_sertrafo)( if(!myrank) fprintf(stderr, "\n"); if(!myrank){ - if(ths->plan[0].plan != NULL){ + if(ths->plan[0].fftw != NULL){ fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, plan0\n", counter); print_dbg(ths->dbg[0]); } else @@ -1102,11 +1109,11 @@ void PX(execute_sertrafo)( /* Checksum inputs */ lsum=0.0; - if(ths->plan[0].plan != NULL) + if(ths->plan[0].fftw != NULL) for(INT k=0; kdbg[0]->in[k]); MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD); - if(ths->plan[0].plan != NULL) + if(ths->plan[0].fftw != NULL) if(!myrank) fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, Checksum(in0) = %e\n", counter, gsum); // #if PFFT_DEBUG_GVARS @@ -1148,16 +1155,16 @@ void PX(execute_sertrafo)( // #endif /* Serial trafo */ - if(ths->plan[0].plan != NULL) { + if(ths->plan[0].fftw != NULL) { PX(execute_fftw_plan)(&ths->plan[0], planned_in, planned_out, executed_in, executed_out); } /* Checksum outputs */ lsum=0.0; - if(ths->plan[0].plan != NULL) + if(ths->plan[0].fftw != NULL) for(INT k=0; kdbg[0]->out[k]); MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD); - if(ths->plan[0].plan != NULL) + if(ths->plan[0].fftw != NULL) if(!myrank) fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, Checksum(out0) = %e - Value may change\n", counter, gsum); // #if PFFT_DEBUG_GVARS @@ -1202,7 +1209,7 @@ void PX(execute_sertrafo)( if(!myrank) fprintf(stderr, "\n"); if(!myrank){ - if(ths->plan[1].plan != NULL){ + if(ths->plan[1].fftw != NULL){ fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, plan1\n", counter); print_dbg(ths->dbg[1]); } else @@ -1213,24 +1220,24 @@ void PX(execute_sertrafo)( /* Checksum inputs */ lsum=0.0; - if(ths->plan[1].plan != NULL) + if(ths->plan[1].fftw != NULL) for(INT k=0; kdbg[1]->in[k]); MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD); - if(ths->plan[1].plan != NULL) + if(ths->plan[1].fftw != NULL) if(!myrank) fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, Checksum(in1) = %e - Value may change.\n", counter, gsum); /* Serial trafo */ - if(ths->plan[1].plan != NULL) { + if(ths->plan[1].fftw != NULL) { PX(execute_fftw_plan)(&ths->plan[1], planned_in, planned_out, executed_in, executed_out); } /* Checksum outputs */ lsum=0.0; - if(ths->plan[1].plan != NULL) + if(ths->plan[1].fftw != NULL) for(INT k=0; kdbg[1]->out[k]); MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD); - if(ths->plan[1].plan != NULL) + if(ths->plan[1].fftw != NULL) if(!myrank) fprintf(stderr, "PFFT_DBG_SERTRAFO: counter = %d, Checksum(out1) = %e\n", counter, gsum); @@ -1238,7 +1245,7 @@ void PX(execute_sertrafo)( #else /* execute all initialized serfft plans */ for(int t=0; t<2; t++) { - if(ths->plan[t].plan != NULL) + if(ths->plan[t].fftw != NULL) PX(execute_fftw_plan)(&ths->plan[t], planned_in, planned_out, executed_in, executed_out); } #endif diff --git a/kernel/timer.c b/kernel/timer.c index 0c47e2b..374076d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -232,7 +232,7 @@ PX(timer) PX(copy_timer)( for(int t=0; trnk_remap; t++) copy->remap[t] = orig->remap[t]; for(int t=0; t<2; t++) - copy->remap_3dto2d[t] = orig->remap_3dto2d[t]; + copy->remap_nd[t] = orig->remap_nd[t]; copy->itwiddle = orig->itwiddle; copy->otwiddle = orig->otwiddle; @@ -252,7 +252,7 @@ void PX(average_timer)( for(int t=0; trnk_remap; t++) ths->remap[t] /= ths->iter; for(int t=0; t<2; t++) - ths->remap_3dto2d[t] /= ths->iter; + ths->remap_nd[t] /= ths->iter; ths->itwiddle /= ths->iter; ths->otwiddle /= ths->iter; @@ -272,7 +272,7 @@ PX(timer) PX(add_timers)( for(int t=0; trnk_remap; t++) res->remap[t] += sum2->remap[t]; for(int t=0; t<2; t++) - res->remap_3dto2d[t] += sum2->remap_3dto2d[t]; + res->remap_nd[t] += sum2->remap_nd[t]; res->itwiddle += sum2->itwiddle; res->otwiddle += sum1->otwiddle; @@ -311,7 +311,7 @@ double* PX(convert_timer2vec)( for(int t=0; trnk_remap; t++) times[m++] = ths->remap[t]; for(int t=0; t<2; t++) - times[m++] = ths->remap_3dto2d[t]; + times[m++] = ths->remap_nd[t]; times[m++] = ths->itwiddle; times[m++] = ths->otwiddle; @@ -332,7 +332,7 @@ PX(timer) PX(convert_vec2timer)( for(int t=0; trnk_remap; t++) ths->remap[t] = times[m++]; for(int t=0; t<2; t++) - ths->remap_3dto2d[t] = times[m++]; + ths->remap_nd[t] = times[m++]; ths->itwiddle = times[m++]; ths->otwiddle = times[m++]; @@ -350,7 +350,7 @@ static void reset_timer( for(int t=0; trnk_remap; t++) ths->remap[t] = 0; for(int t=0; t<2; t++) - ths->remap_3dto2d[t] = 0; + ths->remap_nd[t] = 0; ths->itwiddle = 0; ths->otwiddle = 0; } @@ -379,7 +379,7 @@ static size_t length( /* +3 for rnk_pm, rnk_trafo, rnk_remap */ /* +1 for number of iterations */ /* +1 for whole trafo timer */ - /* +2 for remap_3dto2d[2] */ + /* +2 for remap_nd[2] */ /* +2 for itwiddle, otwiddle */ return (size_t) (ths->rnk_trafo + ths->rnk_remap + 9); } @@ -416,8 +416,8 @@ static void fprint_average_timer_prefixed( /* print times of transposed out step */ PX(fprintf)(comm, file, "%s_itwiddle(%d) = %.3e;\n", prefix, idx, mt->itwiddle); - PX(fprintf)(comm, file, "%s_remap_3dto2d(%d, 2) = %.3e;\n", - prefix, idx, mt->remap_3dto2d[0]); + PX(fprintf)(comm, file, "%s_remap_nd(%d, 2) = %.3e;\n", + prefix, idx, mt->remap_nd[0]); for(int t=0; trnk_pm; t++, k++, l++){ PX(fprintf)(comm, file, "%s_trafo%d(%d, 2) = %.3e; ", prefix, k+1, idx, mt->trafo[k]); @@ -438,7 +438,7 @@ static void fprint_average_timer_prefixed( PX(fprintf)(comm, file, "%s_trafo%d(%d, 2) = %.3e;\n", prefix, k+1, idx, mt->trafo[k]); PX(fprintf)(comm, file, "%s_remap_2dto3d(%d, 2) = %.3e;\n", - prefix, idx, mt->remap_3dto2d[1]); + prefix, idx, mt->remap_nd[1]); PX(fprintf)(comm, file, "%s_otwiddle(%d) = %.3e;\n", prefix, idx, mt->otwiddle); } diff --git a/kernel/transpose.c b/kernel/transpose.c index 72931f1..b9d3cf8 100644 --- a/kernel/transpose.c +++ b/kernel/transpose.c @@ -171,11 +171,11 @@ gtransp_plan PX(plan_global_transp)( ths->dbg = gtransp_mkdbg(N, hm, blk, in, out, comm, fftw_flags); #endif - ths->plan.plan = XM(plan_many_transpose)( + ths->plan.fftw = XM(plan_many_transpose)( N[0], N[1], hm, blk[0], blk[1], in, out, comm, fftw_flags); - ths->plan.planned_in = in; - ths->plan.planned_out = out; + ths->plan.in = in; + ths->plan.out = out; ths->plan.execute = (PX(fftw_execute))(XM(execute_r2r)); return ths; } @@ -188,7 +188,7 @@ static gtransp_plan gtransp_mkplan( gtransp_plan ths = (gtransp_plan) malloc(sizeof(gtransp_plan_s)); /* initialize to NULL for easy checks */ - ths->plan.plan=NULL; + ths->plan.fftw = NULL; /* initialize debug info */ #if PFFT_DEBUG_GTRANSP @@ -207,8 +207,8 @@ void PX(gtransp_rmplan)( return; /* take care of unsuccessful FFTW planing */ - if(ths->plan.plan != NULL) - X(destroy_plan)(ths->plan.plan); + if(ths->plan.fftw != NULL) + X(destroy_plan)(ths->plan.fftw); #if PFFT_DEBUG_GTRANSP if(ths->dbg != NULL) @@ -237,7 +237,7 @@ void PX(execute_gtransp)( if(!myrank) fprintf(stderr, "\n"); if(!myrank){ if(ths != NULL){ - if(ths->plan.plan != NULL){ + if(ths->plan.fftw != NULL){ fprintf(stderr, "PFFT_DBG_GTRANSP: counter = %d\n", counter); print_dbg(ths->dbg); } else @@ -250,18 +250,18 @@ void PX(execute_gtransp)( /* Checksum inputs */ lsum=0.0; if(ths != NULL) - if(ths->plan.plan != NULL) + if(ths->plan.fftw != NULL) for(INT k=0; kdbg->in[k]); MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD); if(ths != NULL) - if(ths->plan.plan != NULL) + if(ths->plan.fftw != NULL) if(!myrank) fprintf(stderr, "PFFT_DBG_GTRANSP: counter = %d, Checksum(in) = %e\n", counter, gsum); #endif /* Global transposition */ if(ths != NULL) - if(ths->plan.plan != NULL) { + if(ths->plan.fftw != NULL) { PX(execute_fftw_plan)(&ths->plan, planned_in, planned_out, executed_in, executed_out); } @@ -271,12 +271,12 @@ void PX(execute_gtransp)( /* Checksum outputs */ lsum=0.0; if(ths != NULL) - if(ths->plan.plan != NULL) + if(ths->plan.fftw != NULL) for(INT k=0; kdbg->out[k]); MPI_Reduce(&lsum, &gsum, 1, PFFT_MPI_REAL_TYPE, MPI_SUM, 0, MPI_COMM_WORLD); if(ths != NULL) - if(ths->plan.plan != NULL) + if(ths->plan.fftw != NULL) if(!myrank) fprintf(stderr, "PFFT_DBG_GTRANSP: counter = %d, Checksum(out) = %e\n", counter, gsum); // if(counter==3){ diff --git a/tests/Makefile.am b/tests/Makefile.am index 9cdef9f..2689a7c 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -19,12 +19,18 @@ AM_CPPFLAGS = -I$(top_srcdir)/kernel # Directory of pfft.h AM_CPPFLAGS += -I$(top_srcdir)/api +# +# Directory of fftw3 +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/mpi # OpenMP support AM_CFLAGS = $(OPENMP_CFLAGS) # Libraries to add to all programs that are built. -LDADD = $(top_builddir)/lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@.la $(fftw3_openmp_LIBS) $(fftw3_mpi_LIBS) $(fftw3_LIBS) +LDADD = $(top_builddir)/lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@.la +LDADD += $(top_builddir)/fftw3/libfftw3@PREC_SUFFIX@@OPENMP_SUFFIX@.la +LDADD += $(top_builddir)/fftw3/mpi/libfftw3@PREC_SUFFIX@_mpi.la EXTRA_DIST = \ @@ -100,6 +106,10 @@ check_PROGRAMS += \ check_PROGRAMS += \ simple_check_r2c_3d_on_3d_transposed_many +check_PROGRAMS += \ + simple_check_r2c_2d_on_2d \ + simple_check_c2c_2d_on_2d + check_PROGRAMS += \ simple_check_ousam_c2c simple_check_ousam_c2c_transposed \ simple_check_ousam_c2r \ diff --git a/tests/f03/Makefile.am b/tests/f03/Makefile.am index d5e7945..6122e10 100644 --- a/tests/f03/Makefile.am +++ b/tests/f03/Makefile.am @@ -3,12 +3,18 @@ include $(top_srcdir)/build-aux/fortran-rules.am # Directory of pnfft.f (which is build first) AM_FCCPPFLAGS = -I$(top_builddir)/api +# +# Directory of fftw3 +AM_FCCPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_FCCPPFLAGS += -I$(top_srcdir)/fftw3/mpi # OpenMP support AM_FCFLAGS = $(OPENMP_FCFLAGS) # Libraries to add to all programs that are built. LDADD = $(top_builddir)/lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@.la $(fftw3_openmp_LIBS) $(fftw3_mpi_LIBS) $(fftw3_LIBS) +LDADD += $(top_builddir)/fftw3/libfftw3@PREC_SUFFIX@@OPENMP_SUFFIX@.la +LDADD += $(top_builddir)/fftw3/mpi/libfftw3@PREC_SUFFIX@_mpi.la # noinst_LTLIBRARIES = libtests.la diff --git a/tests/fortran/Makefile.am b/tests/fortran/Makefile.am index 88b9a98..6adee68 100644 --- a/tests/fortran/Makefile.am +++ b/tests/fortran/Makefile.am @@ -4,11 +4,20 @@ include $(top_srcdir)/build-aux/fortran-rules.am # Directory of pfft.f (which is build first) AM_FCCPPFLAGS = -I$(top_builddir)/api +# Directory of pfft.f (which is build first) +AM_FCCPPFLAGS = -I$(top_builddir)/api + +# Directory of fftw3 +AM_FCCPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_FCCPPFLAGS += -I$(top_srcdir)/fftw3/mpi + # OpenMP support AM_FCFLAGS = $(OPENMP_FCFLAGS) # Libraries to add to all programs that are built. -LDADD = $(top_builddir)/lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@.la $(fftw3_openmp_LIBS) $(fftw3_mpi_LIBS) $(fftw3_LIBS) +LDADD = $(top_builddir)/lib@PFFT_PREFIX@pfft@PREC_SUFFIX@@OPENMP_SUFFIX@.la +LDADD += $(top_builddir)/fftw3/libfftw3@PREC_SUFFIX@@OPENMP_SUFFIX@.la +LDADD += $(top_builddir)/fftw3/mpi/libfftw3@PREC_SUFFIX@_mpi.la # noinst_LTLIBRARIES = libtests.la diff --git a/tests/simple_check_c2c_2d_on_2d.c b/tests/simple_check_c2c_2d_on_2d.c new file mode 100644 index 0000000..bbebaeb --- /dev/null +++ b/tests/simple_check_c2c_2d_on_2d.c @@ -0,0 +1,107 @@ +#include +#include + +int main(int argc, char **argv) +{ + int np[2]; + ptrdiff_t n[2]; + ptrdiff_t alloc_local; + ptrdiff_t local_ni[2], local_i_start[2]; + ptrdiff_t local_no[2], local_o_start[2]; + ptrdiff_t global_ni[2]; + ptrdiff_t global_no[2]; + + double err; + pfft_complex *in, *out; + pfft_plan plan_forw=NULL, plan_back=NULL; + MPI_Comm comm_cart_2d; + + /* Set size of FFT and process mesh */ + n[0] = 4; n[1] = 4; + np[0] = 2; np[1] = 1; + global_no[0] = n[0]; + global_no[1] = n[1]; + global_ni[0] = n[0]; + global_ni[1] = n[1]; + + double *truein; + double *trueout; + int pid; + + /* Initialize MPI and PFFT */ + MPI_Init(&argc, &argv); + pfft_init(); + + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + /* Create three-dimensional process grid of size np[0] x np[1], if possible */ + if( pfft_create_procmesh(2, MPI_COMM_WORLD, np, &comm_cart_2d) ){ + pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]); + MPI_Finalize(); + return 1; + } + + /* Get parameters of data distribution */ + alloc_local = pfft_local_size_dft(2, n, comm_cart_2d, PFFT_TRANSPOSED_NONE, + local_ni, local_i_start, local_no, local_o_start); + + /* Allocate memory */ + in = pfft_alloc_complex(alloc_local); + out = pfft_alloc_complex(alloc_local); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "planning forward\n"); + /* Plan parallel forward FFT */ + plan_forw = pfft_plan_dft(2, + n, in, out, comm_cart_2d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_ESTIMATE| PFFT_DESTROY_INPUT); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "planning backward\n"); + /* Plan parallel backward FFT */ + plan_back = pfft_plan_dft(2, + n, out, in, comm_cart_2d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_ESTIMATE| PFFT_DESTROY_INPUT); + + /* Initialize input with random numbers */ + pfft_init_input_complex(2, n, local_ni, local_i_start, + in); + + truein = pfft_gather_array(2, 2, (double*) in, local_i_start, local_ni, global_ni, MPI_COMM_WORLD); + pfft_print_array(2, 2, truein, global_ni, MPI_COMM_WORLD); + pfft_free(truein); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "running forward\n"); + /* execute parallel forward FFT */ + pfft_execute(plan_forw); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "as complex \n"); + + trueout = pfft_gather_array(2, 2, (double*) out, local_o_start, local_no, global_no, MPI_COMM_WORLD); + pfft_print_array(2, 2, trueout, global_no, MPI_COMM_WORLD); + pfft_free(trueout); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "running backward\n"); + /* clear the old input */ + pfft_clear_input_complex(2, n, local_ni, local_i_start, + in); + /* execute parallel backward FFT */ + pfft_execute(plan_back); + + /* Scale data */ + for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1]; l++) + in[l] /= (n[0]*n[1]); + + truein = pfft_gather_array(2, 2, (double*) in, local_i_start, local_ni, global_ni, MPI_COMM_WORLD); + pfft_print_array(2, 2, truein, global_ni, MPI_COMM_WORLD); + pfft_free(truein); + + /* Print error of back transformed data */ + MPI_Barrier(MPI_COMM_WORLD); + err = pfft_check_output_complex(2, n, local_ni, local_i_start, in, comm_cart_2d); + pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td):\n", n[0], n[1]); + pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err); + + /* free mem and finalize */ + pfft_destroy_plan(plan_forw); + pfft_destroy_plan(plan_back); + MPI_Comm_free(&comm_cart_2d); + pfft_free(in); pfft_free(out); + MPI_Finalize(); + return 0; +} diff --git a/tests/simple_check_c2c_3d_on_3d.c b/tests/simple_check_c2c_3d_on_3d.c index 5de88c0..934ab87 100644 --- a/tests/simple_check_c2c_3d_on_3d.c +++ b/tests/simple_check_c2c_3d_on_3d.c @@ -14,8 +14,8 @@ int main(int argc, char **argv) MPI_Comm comm_cart_3d; /* Set size of FFT and process mesh */ - n[0] = 29; n[1] = 27; n[2] = 31; - np[0] = 2; np[1] = 2; np[2] = 2; + n[0] = 4; n[1] = 4; n[2] = 2; + np[0] = 1; np[1] = 1; np[2] = 2; /* Initialize MPI and PFFT */ MPI_Init(&argc, &argv); @@ -54,7 +54,11 @@ int main(int argc, char **argv) /* clear the old input */ pfft_clear_input_complex_3d(n, local_ni, local_i_start, in); - + + double * gout = pfft_gather_array(3, 2, (double*) out, local_i_start, local_ni, n, MPI_COMM_WORLD); + pfft_print_array(3, 2, gout, n, MPI_COMM_WORLD); + pfft_free(gout); + /* execute parallel backward FFT */ pfft_execute(plan_back); diff --git a/tests/simple_check_r2c_2d_on_2d.c b/tests/simple_check_r2c_2d_on_2d.c new file mode 100644 index 0000000..ccff241 --- /dev/null +++ b/tests/simple_check_r2c_2d_on_2d.c @@ -0,0 +1,160 @@ +#include +#include + +static void printarray(double * a, int n0, int n1) +{ + for(ptrdiff_t l=0; l < n0 * n1; l++) { +// pfft_fprintf(MPI_COMM_WORLD, stderr, "%04.0f, ", a[l]); +// pfft_fprintf(MPI_COMM_WORLD, stderr, "%05.1f, ", a[l]); + fprintf(stderr, "%g, ", a[l]); + if ((l + 1) % n1 == 0) { + fprintf(stderr, "\n", a[l]); + } + } +} +static void gatherarray(double * a, double * in, int local_start0, + int local_n0, int n0, int local_start1, int local_n1, int n1) +{ + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + + memset(a, 0, sizeof(double) * n0 * n1); + for(ptrdiff_t l=0; l < local_n0 * local_n1; l++) { + int x = l / local_n1, y = l % local_n1; + a[(local_start0 + x) * n1 + local_start1 + y] = in[l]; +// + pid * 1000; + } + MPI_Allreduce(MPI_IN_PLACE, a, n0 * n1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); +} + +int main(int argc, char **argv) +{ + int np[2]; + ptrdiff_t n[2]; + ptrdiff_t alloc_local; + ptrdiff_t local_ni[2], local_i_start[2]; + ptrdiff_t local_no[2], local_o_start[2]; + double err, *in; + pfft_complex *out; + pfft_plan plan_forw=NULL, plan_back=NULL; + MPI_Comm comm_cart_2d; + + /* Set size of FFT and process mesh */ + n[0] = 4; n[1] = 4; + np[0] = 1; np[1] = 2; + + double *truein = calloc(sizeof(double), n[0] * n[1]); + double *trueout = calloc(sizeof(double), n[0] * (n[1] / 2 + 1) * 2); + int pid; + + /* Initialize MPI and PFFT */ + MPI_Init(&argc, &argv); + pfft_init(); + + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + /* Create three-dimensional process grid of size np[0] x np[1], if possible */ + if( pfft_create_procmesh(2, MPI_COMM_WORLD, np, &comm_cart_2d) ){ + pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]); + MPI_Finalize(); + return 1; + } + + /* Get parameters of data distribution */ + alloc_local = pfft_local_size_dft_r2c(2, n, comm_cart_2d, PFFT_TRANSPOSED_NONE, + local_ni, local_i_start, local_no, local_o_start); + + /* Allocate memory */ + in = pfft_alloc_real(2 * alloc_local); + out = pfft_alloc_complex(alloc_local); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "planning forward\n"); + /* Plan parallel forward FFT */ + plan_forw = pfft_plan_dft_r2c(2, + n, in, out, comm_cart_2d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_ESTIMATE| PFFT_DESTROY_INPUT); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "planning backward\n"); + /* Plan parallel backward FFT */ + plan_back = pfft_plan_dft_c2r(2, + n, out, in, comm_cart_2d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_ESTIMATE| PFFT_DESTROY_INPUT); + + /* Initialize input with random numbers */ + pfft_init_input_real(2, n, local_ni, local_i_start, + in); + + for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1]; l++) { + // in[l] = pid * 10 + l; + // if(l == 1) + // in[l] = 1; + } + gatherarray(truein, in, local_i_start[0], local_ni[0], n[0], + local_i_start[1], local_ni[1], n[1]); + if(pid == 0) + printarray(truein, n[0], n[1]); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "running forward\n"); + /* execute parallel forward FFT */ + pfft_execute(plan_forw); + + + pfft_fprintf(MPI_COMM_WORLD, stderr, "as real \n"); + gatherarray(truein, out, local_i_start[0], local_ni[0], n[0], + local_i_start[1], local_ni[1], n[1]); + if(pid == 0) + printarray(truein, n[0], n[1]); + + + pfft_fprintf(MPI_COMM_WORLD, stderr, "as complex \n"); + + int kk = 0; + for(kk = 0; kk < n[1] * n[0]; kk++) { + MPI_Barrier(MPI_COMM_WORLD); + if(kk == pid) { + fprintf(stderr, "pid = %d, %td %td %td %td\n", pid, + local_o_start[0], local_o_start[1], + local_no[0], local_no[1]); + + printarray(out, local_no[0], local_no[1] * 2); + } + } + + MPI_Barrier(MPI_COMM_WORLD); + gatherarray(trueout, out, local_o_start[0], local_no[0], n[0], + 2 * local_o_start[1], + 2 * local_no[1], 2 *(n[1] / 2 + 1)); + + + pfft_fprintf(MPI_COMM_WORLD, stderr, "global result\n"); + + if(pid == 0) + printarray(trueout, n[0], (n[1] / 2 + 1) * 2); + + pfft_fprintf(MPI_COMM_WORLD, stderr, "running backward\n"); + /* clear the old input */ + pfft_clear_input_real(2, n, local_ni, local_i_start, + in); + /* execute parallel backward FFT */ + pfft_execute(plan_back); + + /* Scale data */ + for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1]; l++) + in[l] /= (n[0]*n[1]); + + gatherarray(truein, in, local_i_start[0], local_ni[0], n[0], + local_i_start[1], local_ni[1], n[1]); + if(pid == 0) + printarray(truein, n[0], n[1]); + + /* Print error of back transformed data */ + MPI_Barrier(MPI_COMM_WORLD); + err = pfft_check_output_real(2, n, local_ni, local_i_start, in, comm_cart_2d); + pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td):\n", n[0], n[1]); + pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err); + + /* free mem and finalize */ + pfft_destroy_plan(plan_forw); + pfft_destroy_plan(plan_back); + MPI_Comm_free(&comm_cart_2d); + pfft_free(in); pfft_free(out); + MPI_Finalize(); + return 0; +} diff --git a/tests/simple_check_r2c_3d_on_3d.c b/tests/simple_check_r2c_3d_on_3d.c index 8007f3f..f722c24 100644 --- a/tests/simple_check_r2c_3d_on_3d.c +++ b/tests/simple_check_r2c_3d_on_3d.c @@ -14,7 +14,7 @@ int main(int argc, char **argv) MPI_Comm comm_cart_3d; /* Set size of FFT and process mesh */ - n[0] = 29; n[1] = 27; n[2] = 31; + n[0] = 8; n[1] = 8; n[2] = 8; np[0] = 2; np[1] = 2; np[2] = 2; /* Initialize MPI and PFFT */ @@ -38,11 +38,11 @@ int main(int argc, char **argv) /* Plan parallel forward FFT */ plan_forw = pfft_plan_dft_r2c_3d( - n, in, out, comm_cart_3d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT); + n, in, out, comm_cart_3d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_ESTIMATE| PFFT_DESTROY_INPUT); /* Plan parallel backward FFT */ plan_back = pfft_plan_dft_c2r_3d( - n, out, in, comm_cart_3d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT); + n, out, in, comm_cart_3d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_ESTIMATE| PFFT_DESTROY_INPUT); /* Initialize input with random numbers */ pfft_init_input_real(3, n, local_ni, local_i_start, diff --git a/util/Makefile.am b/util/Makefile.am index 9b70925..2885f4e 100644 --- a/util/Makefile.am +++ b/util/Makefile.am @@ -3,6 +3,10 @@ AM_CPPFLAGS = -I$(top_srcdir)/kernel # Directory of pfft.h AM_CPPFLAGS += -I$(top_srcdir)/api +# +# Directory of fftw3 +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/api +AM_CPPFLAGS += -I$(top_srcdir)/fftw3/mpi noinst_LTLIBRARIES = libutil.la diff --git a/util/util.c b/util/util.c index 924a16e..17fcabc 100644 --- a/util/util.c +++ b/util/util.c @@ -248,14 +248,69 @@ unsigned* PX(malloc_unsigned)( return (unsigned*) malloc(sizeof(unsigned) * size); } -int PX(needs_3dto2d_remap)( - int rnk_n, MPI_Comm comm_cart - ) +R * PX(gather_array)(const int rnk_n, const int howmany, + const R * local, + const INT* local_start, + const INT * local_n, + const INT * global_n, + MPI_Comm comm) { - int rnk_pm; - - MPI_Cartdim_get(comm_cart, &rnk_pm); + INT * global_strides = malloc(sizeof(INT) * rnk_n); + INT * offset = malloc(sizeof(INT) * rnk_n); + INT * local_strides = malloc(sizeof(INT) * rnk_n); + + size_t gn = 1, ln = 1; + int i; + for(i = rnk_n - 1; i >=0 ; i --) { + global_strides[i] = gn; + local_strides[i] = ln; + gn *= global_n[i]; + ln *= local_n[i]; + } + R * r = PX(malloc)(sizeof(R) * gn * howmany); + memset(r, 0, (sizeof(R) * gn * howmany)); + for(i = 0; i < ln; i ++) { + INT gi = 0; + INT li = i; + int d; + for(d = 0; d < rnk_n ; d++) { + offset[d] = 0; + offset[d] = (li / local_strides[d]) % local_n[d]; + gi += (offset[d] + local_start[d]) * global_strides[d]; + } + + for(d = 0; d < howmany; d ++) { + r[gi * howmany + d] = local[li * howmany + d]; + } + // printf("%td -> %td (%td) %g\n", li, gi, gn, local[li * howmany]); + } + MPI_Allreduce(MPI_IN_PLACE, r, gn * howmany, PFFT_MPI_REAL_TYPE, MPI_SUM, comm); + free(local_strides); + free(offset); + return r; +} - return (rnk_n == 3) && (rnk_pm == 3); +void PX(print_array)(int rnk_n, + int howmany, + R * a, + const INT * ni, + MPI_Comm comm) +{ + int i; + size_t gn = 1; + for(i = 0; i < rnk_n; i ++) { + gn *= ni[i]; + } + size_t nl = gn / ni[0]; /* line break after first dim */ + + for(ptrdiff_t l=0; l < gn; l++) { + int d; + for(d = 0; d < howmany; d ++) { + pfft_fprintf(comm, stderr, "%g, ", a[l * howmany + d]); + } + if ((l + 1) % nl == 0) { + pfft_fprintf(MPI_COMM_WORLD, stderr, "\n"); + } + } } diff --git a/util/util.h b/util/util.h index 0a95d7d..4ab38eb 100644 --- a/util/util.h +++ b/util/util.h @@ -61,8 +61,5 @@ int* PX(malloc_int)( unsigned* PX(malloc_unsigned)( size_t size); -int PX(needs_3dto2d_remap)( - int rnk_n, MPI_Comm comm_cart); - #endif /* !PFFT_UTIL_H */