From e8f9c616fe14ff45f710490f1854717bf5277722 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 5 May 2017 16:25:17 -0700 Subject: [PATCH 01/29] Update ORTE to include all fixes since v3.x branching Remove the RML/OFI component and the timing macros from ORTE Update PMIx to match that in master. Note this is not the full PMIx v2.0 as that code isn't complete Bring across remaining required changes in OPAL constants and cmd line processor Signed-off-by: Ralph Castain --- opal/include/opal/constants.h | 7 +- opal/mca/pmix/base/pmix_base_frame.c | 2 +- opal/mca/pmix/ext2x/pmix2x.c | 4 +- opal/mca/pmix/pmix2x/configure.m4 | 16 +- opal/mca/pmix/pmix2x/pmix/NEWS | 29 +- opal/mca/pmix/pmix2x/pmix/VERSION | 4 +- opal/mca/pmix/pmix2x/pmix/config/pmix.m4 | 30 +- .../pmix/pmix2x/pmix/config/pmix_functions.m4 | 8 +- .../mca/pmix/pmix2x/pmix/examples/Makefile.am | 7 +- opal/mca/pmix/pmix2x/pmix/examples/debugger.c | 3 +- opal/mca/pmix/pmix2x/pmix/examples/jctrl.c | 229 +++ opal/mca/pmix/pmix2x/pmix/include/Makefile.am | 8 +- opal/mca/pmix/pmix2x/pmix/include/pmix.h | 59 + .../pmix/pmix2x/pmix/include/pmix_common.h | 193 ++- .../pmix/pmix2x/pmix/include/pmix_server.h | 16 +- opal/mca/pmix/pmix2x/pmix/src/Makefile.am | 14 +- .../pmix2x/pmix/src/client/Makefile.include | 4 +- .../pmix2x/pmix/src/client/pmix_client_get.c | 2 +- .../pmix2x/pmix/src/common/Makefile.include | 5 +- .../pmix2x/pmix/src/common/pmix_control.c | 276 ++++ .../pmix/pmix2x/pmix/src/event/pmix_event.h | 56 +- .../pmix/src/event/pmix_event_notification.c | 458 +++--- .../pmix/src/event/pmix_event_registration.c | 643 ++++++--- .../pmix2x/pmix/src/include/pmix_globals.c | 2 + .../pmix2x/pmix/src/include/pmix_globals.h | 6 +- opal/mca/pmix/pmix2x/pmix/src/include/types.h | 9 + .../pmix2x/pmix/src/mca/psensor}/Makefile.am | 19 +- .../pmix/src/mca/psensor}/base/Makefile.am | 13 +- .../pmix2x/pmix/src/mca/psensor/base/base.h | 59 + .../src/mca/psensor/base/psensor_base_frame.c | 103 ++ 
.../mca/psensor/base/psensor_base_select.c | 94 ++ .../src/mca/psensor/base/psensor_base_stubs.c | 66 + .../pmix/src/mca/psensor}/file/Makefile.am | 30 +- .../psensor/file/help-pmix-psensor-file.txt | 4 +- .../pmix/src/mca/psensor/file/psensor_file.c | 350 +++++ .../pmix/src/mca/psensor/file/psensor_file.h | 38 + .../mca/psensor/file/psensor_file_component.c | 69 + .../src/mca/psensor/heartbeat/Makefile.am | 38 + .../heartbeat/help-pmix-psensor-heartbeat.txt | 5 +- .../mca/psensor/heartbeat/psensor_heartbeat.c | 330 +++++ .../mca/psensor/heartbeat/psensor_heartbeat.h | 43 + .../heartbeat/psensor_heartbeat_component.c | 81 ++ .../pmix2x/pmix/src/mca/psensor/psensor.h | 86 ++ .../pmix/pmix2x/pmix/src/mca/ptl/base/base.h | 8 +- .../pmix/src/mca/ptl/base/ptl_base_frame.c | 4 + .../pmix/src/mca/ptl/base/ptl_base_sendrecv.c | 50 +- .../pmix/src/mca/ptl/base/ptl_base_stubs.c | 91 +- opal/mca/pmix/pmix2x/pmix/src/mca/ptl/ptl.h | 15 +- .../pmix/pmix2x/pmix/src/mca/ptl/ptl_types.h | 10 + .../pmix2x/pmix/src/runtime/pmix_params.c | 13 +- .../pmix/src/runtime/pmix_progress_threads.h | 7 +- .../pmix/pmix2x/pmix/src/server/pmix_server.c | 24 +- .../pmix2x/pmix/src/server/pmix_server_get.c | 16 +- .../pmix2x/pmix/src/server/pmix_server_ops.c | 198 ++- .../pmix2x/pmix/src/server/pmix_server_ops.h | 10 + opal/mca/pmix/pmix2x/pmix/src/util/compress.h | 2 +- opal/mca/pmix/pmix2x/pmix/src/util/error.c | 10 + opal/mca/pmix/pmix2x/pmix/src/util/error.h | 1 + opal/mca/pmix/pmix2x/pmix/test/Makefile.am | 11 +- opal/mca/pmix/pmix2x/pmix2x.c | 38 + opal/mca/pmix/pmix2x/pmix2x.h | 11 + opal/mca/pmix/pmix2x/pmix2x_component.c | 2 + opal/mca/pmix/pmix2x/pmix2x_server_north.c | 251 +++- opal/mca/pmix/pmix_server.h | 17 +- opal/mca/pmix/pmix_types.h | 132 +- opal/util/cmd_line.c | 345 +++-- opal/util/cmd_line.h | 36 +- orte/mca/ess/alps/ess_alps_module.c | 16 +- orte/mca/ess/base/base.h | 4 +- orte/mca/ess/base/ess_base_std_orted.c | 43 +- orte/mca/ess/env/ess_env_module.c | 13 +- 
orte/mca/ess/lsf/ess_lsf_module.c | 13 +- orte/mca/ess/slurm/ess_slurm_module.c | 17 +- orte/mca/ess/tm/ess_tm_module.c | 14 +- orte/mca/grpcomm/direct/grpcomm_direct.c | 24 +- orte/mca/odls/alps/odls_alps_module.c | 16 + orte/mca/odls/base/help-orte-odls-base.txt | 2 + orte/mca/odls/base/odls_base_default_fns.c | 120 +- .../odls/default/help-orte-odls-default.txt | 2 + orte/mca/odls/default/odls_default_module.c | 13 +- orte/mca/odls/odls_types.h | 3 + orte/mca/oob/base/base.h | 2 - orte/mca/oob/base/oob_base_frame.c | 7 - orte/mca/oob/tcp/oob_tcp_component.c | 8 +- orte/mca/oob/tcp/oob_tcp_sendrecv.c | 25 - orte/mca/plm/alps/help-plm-alps.txt | 5 +- orte/mca/plm/alps/plm_alps.h | 2 +- orte/mca/plm/alps/plm_alps_component.c | 8 +- orte/mca/plm/alps/plm_alps_module.c | 88 +- orte/mca/plm/base/plm_base_launch_support.c | 193 ++- orte/mca/plm/base/plm_private.h | 3 +- orte/mca/plm/lsf/plm_lsf_module.c | 11 +- orte/mca/plm/rsh/plm_rsh_module.c | 35 +- orte/mca/plm/slurm/help-plm-slurm.txt | 17 +- orte/mca/plm/slurm/plm_slurm.h | 2 + orte/mca/plm/slurm/plm_slurm_component.c | 10 + orte/mca/plm/slurm/plm_slurm_module.c | 34 +- orte/mca/plm/tm/plm_tm_module.c | 28 +- orte/mca/ras/alps/ras_alps_module.c | 22 - orte/mca/ras/loadleveler/Makefile.am | 53 - orte/mca/ras/loadleveler/configure.m4 | 40 - orte/mca/ras/loadleveler/owner.txt | 7 - orte/mca/ras/loadleveler/ras_loadleveler.h | 37 - .../loadleveler/ras_loadleveler_component.c | 105 -- .../ras/loadleveler/ras_loadleveler_module.c | 191 --- orte/mca/rmaps/base/rmaps_base_support_fns.c | 89 +- orte/mca/rmaps/ppr/rmaps_ppr.c | 76 +- orte/mca/rmaps/rmaps_types.h | 4 +- orte/mca/rml/base/base.h | 2 - orte/mca/rml/base/rml_base_frame.c | 4 - orte/mca/rml/base/rml_base_msg_handlers.c | 3 - orte/mca/rml/ofi/.opal_ignore | 0 orte/mca/rml/ofi/.opal_unignore | 2 - orte/mca/rml/ofi/Makefile.am | 49 - orte/mca/rml/ofi/configure.m4 | 29 - orte/mca/rml/ofi/rml_ofi.h | 205 --- orte/mca/rml/ofi/rml_ofi_component.c | 1248 
----------------- orte/mca/rml/ofi/rml_ofi_request.h | 137 -- orte/mca/rml/ofi/rml_ofi_send.c | 805 ----------- orte/mca/rml/oob/rml_oob_send.c | 4 - orte/mca/routed/direct/routed_direct.c | 7 +- orte/mca/schizo/base/base.h | 2 +- orte/mca/schizo/base/schizo_base_stubs.c | 8 +- orte/mca/schizo/ompi/schizo_ompi.c | 254 ++-- orte/mca/schizo/schizo.h | 2 +- orte/mca/schizo/slurm/schizo_slurm.c | 58 +- .../mca/schizo/slurm/schizo_slurm_component.c | 7 +- orte/mca/sensor/base/base.h | 39 - orte/mca/sensor/base/sensor_base_fns.c | 158 --- orte/mca/sensor/base/sensor_base_frame.c | 133 -- orte/mca/sensor/base/sensor_base_select.c | 219 --- orte/mca/sensor/base/sensor_private.h | 67 - orte/mca/sensor/file/configure.m4 | 24 - orte/mca/sensor/file/sensor_file.c | 354 ----- orte/mca/sensor/file/sensor_file.h | 42 - orte/mca/sensor/file/sensor_file_component.c | 120 -- orte/mca/sensor/ft_tester/Makefile.am | 36 - orte/mca/sensor/ft_tester/configure.m4 | 24 - orte/mca/sensor/ft_tester/sensor_ft_tester.h | 41 - .../ft_tester/sensor_ft_tester_component.c | 141 -- orte/mca/sensor/heartbeat/Makefile.am | 38 - orte/mca/sensor/heartbeat/configure.m4 | 24 - orte/mca/sensor/heartbeat/sensor_heartbeat.c | 279 ---- orte/mca/sensor/heartbeat/sensor_heartbeat.h | 32 - .../heartbeat/sensor_heartbeat_component.c | 75 - orte/mca/sensor/resusage/Makefile.am | 38 - orte/mca/sensor/resusage/configure.m4 | 24 - .../resusage/help-orte-sensor-resusage.txt | 21 - orte/mca/sensor/resusage/sensor_resusage.c | 478 ------- orte/mca/sensor/resusage/sensor_resusage.h | 41 - .../resusage/sensor_resusage_component.c | 138 -- orte/mca/sensor/sensor.h | 107 -- orte/mca/sensor/sensor_types.h | 51 - orte/mca/state/base/state_base_fns.c | 47 +- orte/mca/state/dvm/state_dvm.c | 86 +- orte/mca/state/orted/state_orted.c | 4 +- orte/mca/state/state.h | 25 +- .../sensor_ft_tester.c => orted/ft_tester.c} | 6 +- orte/orted/help-orted.txt | 22 +- orte/orted/orted_comm.c | 130 +- orte/orted/orted_main.c | 52 +- 
orte/orted/orted_submit.c | 164 +-- orte/orted/orted_submit.h | 4 +- orte/orted/pmix/pmix_server.c | 43 +- orte/orted/pmix/pmix_server_dyn.c | 12 +- orte/orted/pmix/pmix_server_fence.c | 13 +- orte/orted/pmix/pmix_server_gen.c | 71 +- orte/orted/pmix/pmix_server_internal.h | 26 +- orte/orted/pmix/pmix_server_pub.c | 9 +- orte/runtime/orte_globals.c | 2 + orte/runtime/orte_globals.h | 2 + orte/runtime/orte_init.c | 11 +- orte/runtime/orte_quit.c | 8 +- orte/tools/orte-dvm/orte-dvm.c | 43 +- orte/tools/orterun/help-orterun.txt | 2 + orte/tools/orterun/orterun.c | 8 +- orte/util/error_strings.c | 10 +- orte/util/nidmap.c | 816 +++++------ orte/util/nidmap.h | 12 +- orte/util/regex.c | 226 +-- orte/util/regex.h | 8 +- 181 files changed, 5739 insertions(+), 7915 deletions(-) create mode 100644 opal/mca/pmix/pmix2x/pmix/examples/jctrl.c create mode 100644 opal/mca/pmix/pmix2x/pmix/src/common/pmix_control.c rename {orte/mca/sensor => opal/mca/pmix/pmix2x/pmix/src/mca/psensor}/Makefile.am (62%) rename {orte/mca/sensor => opal/mca/pmix/pmix2x/pmix/src/mca/psensor}/base/Makefile.am (60%) create mode 100644 opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/base.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_frame.c create mode 100644 opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_select.c create mode 100644 opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_stubs.c rename {orte/mca/sensor => opal/mca/pmix/pmix2x/pmix/src/mca/psensor}/file/Makefile.am (50%) rename orte/mca/sensor/file/help-orte-sensor-file.txt => opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/help-pmix-psensor-file.txt (98%) create mode 100644 opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file.c create mode 100644 opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file_component.c create mode 100644 opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/Makefile.am 
rename orte/mca/sensor/heartbeat/help-orte-sensor-heartbeat.txt => opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/help-pmix-psensor-heartbeat.txt (98%) create mode 100644 opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/psensor_heartbeat.c create mode 100644 opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/psensor_heartbeat.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/psensor_heartbeat_component.c create mode 100644 opal/mca/pmix/pmix2x/pmix/src/mca/psensor/psensor.h delete mode 100644 orte/mca/ras/loadleveler/Makefile.am delete mode 100644 orte/mca/ras/loadleveler/configure.m4 delete mode 100644 orte/mca/ras/loadleveler/owner.txt delete mode 100644 orte/mca/ras/loadleveler/ras_loadleveler.h delete mode 100644 orte/mca/ras/loadleveler/ras_loadleveler_component.c delete mode 100644 orte/mca/ras/loadleveler/ras_loadleveler_module.c delete mode 100644 orte/mca/rml/ofi/.opal_ignore delete mode 100644 orte/mca/rml/ofi/.opal_unignore delete mode 100644 orte/mca/rml/ofi/Makefile.am delete mode 100644 orte/mca/rml/ofi/configure.m4 delete mode 100644 orte/mca/rml/ofi/rml_ofi.h delete mode 100644 orte/mca/rml/ofi/rml_ofi_component.c delete mode 100644 orte/mca/rml/ofi/rml_ofi_request.h delete mode 100644 orte/mca/rml/ofi/rml_ofi_send.c delete mode 100644 orte/mca/sensor/base/base.h delete mode 100644 orte/mca/sensor/base/sensor_base_fns.c delete mode 100644 orte/mca/sensor/base/sensor_base_frame.c delete mode 100644 orte/mca/sensor/base/sensor_base_select.c delete mode 100644 orte/mca/sensor/base/sensor_private.h delete mode 100644 orte/mca/sensor/file/configure.m4 delete mode 100644 orte/mca/sensor/file/sensor_file.c delete mode 100644 orte/mca/sensor/file/sensor_file.h delete mode 100644 orte/mca/sensor/file/sensor_file_component.c delete mode 100644 orte/mca/sensor/ft_tester/Makefile.am delete mode 100644 orte/mca/sensor/ft_tester/configure.m4 delete mode 100644 orte/mca/sensor/ft_tester/sensor_ft_tester.h delete mode 100644 
orte/mca/sensor/ft_tester/sensor_ft_tester_component.c delete mode 100644 orte/mca/sensor/heartbeat/Makefile.am delete mode 100644 orte/mca/sensor/heartbeat/configure.m4 delete mode 100644 orte/mca/sensor/heartbeat/sensor_heartbeat.c delete mode 100644 orte/mca/sensor/heartbeat/sensor_heartbeat.h delete mode 100644 orte/mca/sensor/heartbeat/sensor_heartbeat_component.c delete mode 100644 orte/mca/sensor/resusage/Makefile.am delete mode 100644 orte/mca/sensor/resusage/configure.m4 delete mode 100644 orte/mca/sensor/resusage/help-orte-sensor-resusage.txt delete mode 100644 orte/mca/sensor/resusage/sensor_resusage.c delete mode 100644 orte/mca/sensor/resusage/sensor_resusage.h delete mode 100644 orte/mca/sensor/resusage/sensor_resusage_component.c delete mode 100644 orte/mca/sensor/sensor.h delete mode 100644 orte/mca/sensor/sensor_types.h rename orte/{mca/sensor/ft_tester/sensor_ft_tester.c => orted/ft_tester.c} (99%) diff --git a/opal/include/opal/constants.h b/opal/include/opal/constants.h index f05e53b6cdd..6eac3757e2b 100644 --- a/opal/include/opal/constants.h +++ b/opal/include/opal/constants.h @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -94,10 +94,11 @@ enum { OPAL_ERR_PROC_RESTART = (OPAL_ERR_BASE - 63), OPAL_ERR_PROC_CHECKPOINT = (OPAL_ERR_BASE - 64), OPAL_ERR_PROC_MIGRATE = (OPAL_ERR_BASE - 65), - OPAL_ERR_EVENT_REGISTRATION = (OPAL_ERR_BASE - 66) + OPAL_ERR_EVENT_REGISTRATION = (OPAL_ERR_BASE - 66), + OPAL_ERR_HEARTBEAT_ALERT = (OPAL_ERR_BASE - 67), + OPAL_ERR_FILE_ALERT = (OPAL_ERR_BASE - 68) }; #define OPAL_ERR_MAX (OPAL_ERR_BASE - 100) #endif /* OPAL_CONSTANTS_H */ - diff --git a/opal/mca/pmix/base/pmix_base_frame.c b/opal/mca/pmix/base/pmix_base_frame.c index ff6ef49c7dc..99d281fe722 100644 --- a/opal/mca/pmix/base/pmix_base_frame.c +++ b/opal/mca/pmix/base/pmix_base_frame.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * diff --git a/opal/mca/pmix/ext2x/pmix2x.c b/opal/mca/pmix/ext2x/pmix2x.c index bb6d37d5240..253276fca6e 100644 --- a/opal/mca/pmix/ext2x/pmix2x.c +++ b/opal/mca/pmix/ext2x/pmix2x.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Mellanox Technologies, Inc. 
@@ -352,7 +352,7 @@ static void _event_hdlr(int sd, short args, void *cbdata) if (NULL != chain->final_cbfunc) { chain->final_cbfunc(PMIX_SUCCESS, chain->final_cbdata); } - + OBJ_RELEASE(chain); return; diff --git a/opal/mca/pmix/pmix2x/configure.m4 b/opal/mca/pmix/pmix2x/configure.m4 index 928be5e2632..109491d67cc 100644 --- a/opal/mca/pmix/pmix2x/configure.m4 +++ b/opal/mca/pmix/pmix2x/configure.m4 @@ -49,7 +49,19 @@ AC_DEFUN([MCA_opal_pmix_pmix2x_CONFIG],[ opal_pmix_pmix2x_sm_flag=--disable-dstore fi - opal_pmix_pmix2x_args="--with-pmix-symbol-rename=OPAL_MCA_PMIX2X_ $opal_pmix_pmix2x_sm_flag --without-tests-examples --disable-visibility --enable-embedded-libevent --with-libevent-header=\\\"opal/mca/event/$opal_event_base_include\\\"" + AC_ARG_ENABLE([pmix-timing], + [AC_HELP_STRING([--enable-pmix-timing], + [Enable PMIx timing measurements (default: disabled)])]) + AC_MSG_CHECKING([if PMIx timing is enabled]) + if test "$enable_pmix_timing" == "yes"; then + AC_MSG_RESULT([yes]) + opal_pmix_pmix2x_timing_flag=--enable-pmix-timing + else + AC_MSG_RESULT([no (disabled)]) + opal_pmix_pmix2x_timing_flag=--disable-pmix-timing + fi + + opal_pmix_pmix2x_args="--with-pmix-symbol-rename=OPAL_MCA_PMIX2X_ $opal_pmix_pmix2x_sm_flag $opal_pmix_pmix2x_timing_flag --without-tests-examples --disable-pmix-backward-compatibility --disable-visibility --enable-embedded-libevent --with-libevent-header=\\\"opal/mca/event/$opal_event_base_include\\\" --enable-embedded-mode" AS_IF([test "$enable_debug" = "yes"], [opal_pmix_pmix2x_args="--enable-debug $opal_pmix_pmix2x_args" CFLAGS="$OPAL_CFLAGS_BEFORE_PICKY $OPAL_VISIBILITY_CFLAGS -g"], @@ -57,7 +69,7 @@ AC_DEFUN([MCA_opal_pmix_pmix2x_CONFIG],[ CFLAGS="$OPAL_CFLAGS_BEFORE_PICKY $OPAL_VISIBILITY_CFLAGS"]) AS_IF([test "$with_devel_headers" = "yes"], [opal_pmix_pmix2x_args="--with-devel-headers $opal_pmix_pmix2x_args"], - [opal_pmix_pmix2x_args="--enable-embedded-mode $opal_pmix_pmix2x_args"]) + 
[opal_pmix_pmix2x_args=$opal_pmix_pmix2x_args]) CPPFLAGS="-I$OPAL_TOP_SRCDIR -I$OPAL_TOP_BUILDDIR -I$OPAL_TOP_SRCDIR/opal/include -I$OPAL_TOP_BUILDDIR/opal/include $CPPFLAGS" OPAL_CONFIG_SUBDIR([$opal_pmix_pmix2x_basedir/pmix], diff --git a/opal/mca/pmix/pmix2x/pmix/NEWS b/opal/mca/pmix/pmix2x/pmix/NEWS index 688bd14671a..86f4438f1bb 100644 --- a/opal/mca/pmix/pmix2x/pmix/NEWS +++ b/opal/mca/pmix/pmix2x/pmix/NEWS @@ -1,4 +1,5 @@ -Copyright (c) 2015-2016 Intel, Inc. All rights reserved. +Copyright (c) 2015-2017 Intel, Inc. All rights reserved. +Copyright (c) 2017 IBM Corporation. All rights reserved. $COPYRIGHT$ Additional copyrights may follow @@ -23,6 +24,32 @@ current release as well as the "stable" bug fix release branch. Master (not on release branches yet) ------------------------------------ +1.2.2 -- 21 March 2017 +---------------------- +- Compiler fix for Sun/Oracle CC (PR #322) +- Fix missing include (PR #326) +- Improve error checking around posix_fallocate (PR #329) +- Fix possible memory corruption (PR #331) + + +1.2.1 -- 21 Feb. 2017 +---------------------- +- dstore: Fix data corruption bug in key overwrite cases +- dstore: Performance and scalability fixes +- sm: Use posix_fallocate() before mmap +- pmi1/pmi2: Restore support +- dstore: Fix extension slot size allocation (Issue #280) + + +1.2.0 -- 14 Dec. 2016 +---------------------- +- Add shared memory data storage (dstore) option. 
Default: enabled + Configure option: --disable-dstore +- PMIx_Commit performance improvements +- Disable errhandler support +- Keep job info in the shared memory dstore +- PMIx_Get performance and memory improvements + 1.1.5 ----- - Add pmix_version.h to support direct detection of PMIx library version diff --git a/opal/mca/pmix/pmix2x/pmix/VERSION b/opal/mca/pmix/pmix2x/pmix/VERSION index fee3bc39c4a..b7a91495220 100644 --- a/opal/mca/pmix/pmix2x/pmix/VERSION +++ b/opal/mca/pmix/pmix2x/pmix/VERSION @@ -30,7 +30,7 @@ greek= # command, or with the date (if "git describe" fails) in the form of # "date". -repo_rev=git4cdd5e0 +repo_rev=git198a2b0 # If tarball_version is not empty, it is used as the version string in # the tarball filename, regardless of all other versions listed in @@ -44,7 +44,7 @@ tarball_version= # The date when this release was created -date="Mar 11, 2017" +date="Apr 12, 2017" # The shared library version of each of PMIx's public libraries. # These versions are maintained in accordance with the "Library diff --git a/opal/mca/pmix/pmix2x/pmix/config/pmix.m4 b/opal/mca/pmix/pmix2x/pmix/config/pmix.m4 index bde0572d35a..236a9fd9242 100644 --- a/opal/mca/pmix/pmix2x/pmix/config/pmix.m4 +++ b/opal/mca/pmix/pmix2x/pmix/config/pmix.m4 @@ -950,18 +950,18 @@ AC_MSG_RESULT([$with_ident_string]) # Timing support # AC_MSG_CHECKING([if want developer-level timing support]) -AC_ARG_ENABLE(timing, - AC_HELP_STRING([--enable-timing], - [enable developer-level timing code (default: disabled)])) -if test "$enable_timing" = "yes"; then +AC_ARG_ENABLE(pmix-timing, + AC_HELP_STRING([--enable-pmix-timing], + [enable PMIx developer-level timing code (default: disabled)])) +if test "$enable_pmix_timing" = "yes"; then AC_MSG_RESULT([yes]) - WANT_TIMING=1 + WANT_PMIX_TIMING=1 else AC_MSG_RESULT([no]) - WANT_TIMING=0 + WANT_PMIX_TIMING=0 fi -AC_DEFINE_UNQUOTED([PMIX_ENABLE_TIMING], [$WANT_TIMING], +AC_DEFINE_UNQUOTED([PMIX_ENABLE_TIMING], [$WANT_PMIX_TIMING], [Whether we 
want developer-level timing support or not]) # @@ -979,6 +979,21 @@ else WANT_INSTALL_HEADERS=0 fi +# +# Install backward compatibility support for PMI-1 and PMI-2 +# +AC_MSG_CHECKING([if want backward compatibility for PMI-1 and PMI-2]) +AC_ARG_ENABLE(pmix-backward-compatibility, + AC_HELP_STRING([--enable-pmix-backward-compatibility], + [enable PMIx support for PMI-1 and PMI-2 (default: enabled)])) +if test "$enable_pmix_backward_compatibility" = "no"; then + AC_MSG_RESULT([no]) + WANT_PMIX_BACKWARD=0 +else + AC_MSG_RESULT([yes]) + WANT_PMIX_BACKWARD=1 +fi + AM_CONDITIONAL([WANT_INSTALL_HEADERS], [test $WANT_INSTALL_HEADERS -eq 1]) ])dnl @@ -994,6 +1009,7 @@ AC_DEFUN([PMIX_DO_AM_CONDITIONALS],[ AM_CONDITIONAL([WANT_DSTORE], [test "x$enable_dstore" != "xno"]) AM_CONDITIONAL([WANT_PRIMARY_HEADERS], [test "x$pmix_install_primary_headers" = "xyes"]) AM_CONDITIONAL(WANT_INSTALL_HEADERS, test "$WANT_INSTALL_HEADERS" = 1) + AM_CONDITIONAL(WANT_PMIX_BACKWARD, test "$WANT_PMIX_BACKWARD" = 1) ]) pmix_did_am_conditionals=yes ])dnl diff --git a/opal/mca/pmix/pmix2x/pmix/config/pmix_functions.m4 b/opal/mca/pmix/pmix2x/pmix/config/pmix_functions.m4 index 9f7ecb9d95d..84c04741f6a 100644 --- a/opal/mca/pmix/pmix2x/pmix/config/pmix_functions.m4 +++ b/opal/mca/pmix/pmix2x/pmix/config/pmix_functions.m4 @@ -13,7 +13,9 @@ dnl All rights reserved. dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. dnl Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. dnl Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. -dnl Copyright (c) 2013-2016 Intel, Inc. All rights reserved. +dnl Copyright (c) 2013-2017 Intel, Inc. All rights reserved. +dnl Copyright (c) 2017 Research Organization for Information Science +dnl and Technology (RIST). All rights reserved. dnl dnl $COPYRIGHT$ dnl @@ -278,7 +280,7 @@ for val in ${$1}; do # http://www.open-mpi.org/community/lists/devel/2012/08/11362.php). 
case $val in - -Xclang) + -Xclang|-Xg) pmix_found=0 pmix_i=`expr $pmix_count + 1` ;; @@ -366,7 +368,7 @@ AC_DEFUN([PMIX_FLAGS_UNIQ],[ # https://github.com/open-mpi/ompi/issues/324). case $val in - -Xclang) + -Xclang|-Xg) pmix_found=0 pmix_i=`expr $pmix_count + 1` ;; diff --git a/opal/mca/pmix/pmix2x/pmix/examples/Makefile.am b/opal/mca/pmix/pmix2x/pmix/examples/Makefile.am index b315c662fd2..07ae0061570 100644 --- a/opal/mca/pmix/pmix2x/pmix/examples/Makefile.am +++ b/opal/mca/pmix/pmix2x/pmix/examples/Makefile.am @@ -21,7 +21,7 @@ AM_CPPFLAGS = -I$(top_builddir)/src -I$(top_builddir)/src/include -I$(top_builddir)/include -I$(top_builddir)/include/pmix -noinst_PROGRAMS = client dmodex dynamic fault pub tool debugger debuggerd alloc +noinst_PROGRAMS = client dmodex dynamic fault pub tool debugger debuggerd alloc jctrl if !WANT_HIDDEN # these examples use internal symbols # use --disable-visibility @@ -40,11 +40,14 @@ debuggerd_SOURCES = debuggerd.c debuggerd_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) debuggerd_LDADD = $(top_builddir)/src/libpmix.la - alloc_SOURCES = alloc.c alloc_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) alloc_LDADD = $(top_builddir)/src/libpmix.la +jctrl_SOURCES = jctrl.c +jctrl_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) +jctrl_LDADD = $(top_builddir)/src/libpmix.la + dmodex_SOURCES = dmodex.c dmodex_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) dmodex_LDADD = $(top_builddir)/src/libpmix.la diff --git a/opal/mca/pmix/pmix2x/pmix/examples/debugger.c b/opal/mca/pmix/pmix2x/pmix/examples/debugger.c index 1887c16f22f..62bc8e593f2 100644 --- a/opal/mca/pmix/pmix2x/pmix/examples/debugger.c +++ b/opal/mca/pmix/pmix2x/pmix/examples/debugger.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015 Mellanox Technologies, Inc. 
All rights reserved. * $COPYRIGHT$ * @@ -430,7 +430,6 @@ static void infocbfunc(pmix_status_t status, static int attach_to_running_job(char *nspace) { pmix_status_t rc; - pmix_proc_t myproc; pmix_query_t *query; size_t nq; mydbug_query_t *q; diff --git a/opal/mca/pmix/pmix2x/pmix/examples/jctrl.c b/opal/mca/pmix/pmix2x/pmix/examples/jctrl.c new file mode 100644 index 00000000000..5c1c1d1f73d --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/examples/jctrl.c @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2013 Los Alamos National Security, LLC. + * All rights reserved. + * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include + +static pmix_proc_t myproc; + +/* this is the event notification function we pass down below + * when registering for general events - i.e.,, the default + * handler. 
We don't technically need to register one, but it + * is usually good practice to catch any events that occur */ +static void notification_fn(size_t evhdlr_registration_id, + pmix_status_t status, + const pmix_proc_t *source, + pmix_info_t info[], size_t ninfo, + pmix_info_t results[], size_t nresults, + pmix_event_notification_cbfunc_fn_t cbfunc, + void *cbdata) +{ + if (NULL != cbfunc) { + cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata); + } +} + +/* event handler registration is done asynchronously because it + * may involve the PMIx server registering with the host RM for + * external events. So we provide a callback function that returns + * the status of the request (success or an error), plus a numerical index + * to the registered event. The index is used later on to deregister + * an event handler - if we don't explicitly deregister it, then the + * PMIx server will do so when it see us exit */ +static void evhandler_reg_callbk(pmix_status_t status, + size_t evhandler_ref, + void *cbdata) +{ + volatile int *active = (volatile int*)cbdata; + + if (PMIX_SUCCESS != status) { + fprintf(stderr, "Client %s:%d EVENT HANDLER REGISTRATION FAILED WITH STATUS %d, ref=%lu\n", + myproc.nspace, myproc.rank, status, (unsigned long)evhandler_ref); + } + *active = status; +} + +static void infocbfunc(pmix_status_t status, + pmix_info_t *info, size_t ninfo, + void *cbdata, + pmix_release_cbfunc_t release_fn, + void *release_cbdata) +{ + volatile int *active = (volatile int*)cbdata; + + /* release the caller */ + if (NULL != release_fn) { + release_fn(release_cbdata); + } + + *active = status; +} + +int main(int argc, char **argv) +{ + int rc; + pmix_value_t value; + pmix_value_t *val = &value; + pmix_proc_t proc; + uint32_t nprocs, n; + pmix_info_t *info, *iptr; + bool flag; + volatile int active; + pmix_data_array_t *dptr; + + /* init us - note that the call to "init" includes the return of + * any job-related info provided by the RM. 
*/ + if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Init failed: %d\n", myproc.nspace, myproc.rank, rc); + exit(0); + } + fprintf(stderr, "Client ns %s rank %d: Running\n", myproc.nspace, myproc.rank); + + + /* register our default event handler - again, this isn't strictly + * required, but is generally good practice */ + active = -1; + PMIx_Register_event_handler(NULL, 0, NULL, 0, + notification_fn, evhandler_reg_callbk, (void*)&active); + while (-1 == active) { + sleep(1); + } + if (0 != active) { + fprintf(stderr, "[%s:%d] Default handler registration failed\n", myproc.nspace, myproc.rank); + exit(active); + } + + /* job-related info is found in our nspace, assigned to the + * wildcard rank as it doesn't relate to a specific rank. Setup + * a name to retrieve such values */ + PMIX_PROC_CONSTRUCT(&proc); + (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); + proc.rank = PMIX_RANK_WILDCARD; + + /* get our universe size */ + if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Get universe size failed: %d\n", myproc.nspace, myproc.rank, rc); + goto done; + } + nprocs = val->data.uint32; + PMIX_VALUE_RELEASE(val); + fprintf(stderr, "Client %s:%d universe size %d\n", myproc.nspace, myproc.rank, nprocs); + + /* inform the RM that we are preemptible, and that our checkpoint methods are + * "signal" on SIGUSR2 and event on PMIX_JCTRL_CHECKPOINT */ + PMIX_INFO_CREATE(info, 2); + flag = true; + PMIX_INFO_LOAD(&info[0], PMIX_JOB_CTRL_PREEMPTIBLE, (void*)&flag, PMIX_BOOL); + /* can't use "load" to load a pmix_data_array_t */ + (void)strncpy(info[1].key, PMIX_JOB_CTRL_CHECKPOINT_METHOD, PMIX_MAX_KEYLEN); + info[1].value.type = PMIX_DATA_ARRAY; + dptr = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t)); + info[1].value.data.darray = dptr; + dptr->type = PMIX_INFO; + dptr->size = 2; + PMIX_INFO_CREATE(dptr->array, dptr->size); + rc = 
SIGUSR2; + iptr = (pmix_info_t*)dptr->array; + PMIX_INFO_LOAD(&iptr[0], PMIX_JOB_CTRL_CHECKPOINT_SIGNAL, &rc, PMIX_INT); + rc = PMIX_JCTRL_CHECKPOINT; + PMIX_INFO_LOAD(&iptr[1], PMIX_JOB_CTRL_CHECKPOINT_EVENT, &rc, PMIX_STATUS); + + /* since this is informational and not a requested operation, the target parameter + * doesn't mean anything and can be ignored */ + active = -1; + if (PMIX_SUCCESS != (rc = PMIx_Job_control_nb(NULL, 0, info, 2, infocbfunc, (void*)&active))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Job_control_nb failed: %d\n", myproc.nspace, myproc.rank, rc); + goto done; + } + while (-1 == active) { + sleep(1); + } + PMIX_INFO_FREE(info, 2); + if (0 != active) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Job_control_nb failed: %d\n", myproc.nspace, myproc.rank, rc); + exit(active); + } + + /* now request that this process be monitored using heartbeats */ + PMIX_INFO_CREATE(iptr, 1); + PMIX_INFO_LOAD(&iptr[0], PMIX_MONITOR_HEARTBEAT, NULL, PMIX_POINTER); + + PMIX_INFO_CREATE(info, 3); + PMIX_INFO_LOAD(&info[0], PMIX_MONITOR_ID, "MONITOR1", PMIX_STRING); + n = 5; // require a heartbeat every 5 seconds + PMIX_INFO_LOAD(&info[1], PMIX_MONITOR_HEARTBEAT_TIME, &n, PMIX_UINT32); + n = 2; // two heartbeats can be missed before declaring us "stalled" + PMIX_INFO_LOAD(&info[2], PMIX_MONITOR_HEARTBEAT_DROPS, &n, PMIX_UINT32); + + /* make the request */ + active = -1; + if (PMIX_SUCCESS != (rc = PMIx_Process_monitor_nb(iptr, PMIX_MONITOR_HEARTBEAT_ALERT, + info, 3, infocbfunc, (void*)&active))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Process_monitor_nb failed: %d\n", myproc.nspace, myproc.rank, rc); + goto done; + } + while (-1 == active) { + sleep(1); + } + PMIX_INFO_FREE(iptr, 1); + PMIX_INFO_FREE(info, 3); + if (0 != active) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Process_monitor_nb failed: %d\n", myproc.nspace, myproc.rank, rc); + exit(active); + } + + /* send a heartbeat */ + PMIx_Heartbeat(); + + /* call fence to synchronize with 
our peers - no need to + * collect any info as we didn't "put" anything */ + PMIX_INFO_CREATE(info, 1); + flag = false; + PMIX_INFO_LOAD(info, PMIX_COLLECT_DATA, &flag, PMIX_BOOL); + if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, info, 1))) { + fprintf(stderr, "Client ns %s rank %d: PMIx_Fence failed: %d\n", myproc.nspace, myproc.rank, rc); + goto done; + } + PMIX_INFO_FREE(info, 1); + + + done: + /* finalize us */ + fprintf(stderr, "Client ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank); + if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) { + fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc); + } else { + fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank); + } + fflush(stderr); + return(0); +} diff --git a/opal/mca/pmix/pmix2x/pmix/include/Makefile.am b/opal/mca/pmix/pmix2x/pmix/include/Makefile.am index 7a59803e514..52ad624c512 100644 --- a/opal/mca/pmix/pmix2x/pmix/include/Makefile.am +++ b/opal/mca/pmix/pmix2x/pmix/include/Makefile.am @@ -15,10 +15,14 @@ include_HEADERS = \ pmix.h \ pmix_common.h \ pmix_server.h \ - pmi.h \ - pmi2.h \ pmix_tool.h +if WANT_PMIX_BACKWARD +include_HEADERS += \ + pmi.h \ + pmi2.h +endif + nodist_include_HEADERS = \ pmix_version.h \ pmix_rename.h diff --git a/opal/mca/pmix/pmix2x/pmix/include/pmix.h b/opal/mca/pmix/pmix2x/pmix/include/pmix.h index df43e348baa..c7190ec2670 100644 --- a/opal/mca/pmix/pmix2x/pmix/include/pmix.h +++ b/opal/mca/pmix/pmix2x/pmix/include/pmix.h @@ -473,6 +473,65 @@ pmix_status_t PMIx_Allocation_request_nb(pmix_alloc_directive_t directive, pmix_info_t *info, size_t ninfo, pmix_info_cbfunc_t cbfunc, void *cbdata); +/* Request a job control action. The targets array identifies the + * processes to which the requested job control action is to be applied. + * A NULL value can be used to indicate all processes in the caller's + * nspace. 
PMIX_RANK_WILDCARD can also be used to indicate + * that all processes in the given nspace are to be included. + * + * The directives are provided as pmix_info_t structs in the directives + * array. The callback function provides a status to indicate whether or + * not the request was granted, and to provide some information as to + * the reason for any denial in the pmix_info_cbfunc_t array of pmix_info_t + * structures. If non-NULL, then the specified release_fn must be called + * when the callback function completes - this will be used to release + * any provided pmix_info_t array. + */ +pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_t ntargets, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* Request that something be monitored - e.g., that the server monitor + * this process for periodic heartbeats as an indication that the process + * has not become "wedged". When a monitor detects the specified alarm + * condition, it will generate an event notification using the provided + * error code and passing along any available relevant information. It is + * up to the caller to register a corresponding event handler. + * + * Params: + * + * monitor: attribute indicating the type of monitor being requested - e.g., + * PMIX_MONITOR_FILE to indicate that the requestor is asking that + * a file be monitored. + * + * error: the status code to be used when generating an event notification + * alerting that the monitor has been triggered.
The range of the + * notification defaults to PMIX_RANGE_NAMESPACE - this can be + * changed by providing a PMIX_RANGE directive + * + * directives: characterize the monitoring request (e.g., monitor file size) + * and frequency of checking to be done + * + * cbfunc: provides a status to indicate whether or not the request was granted, + * and to provide some information as to the reason for any denial in + * the pmix_info_cbfunc_t array of pmix_info_t structures. + * + * Note: a process can send a heartbeat to the server using the PMIx_Heartbeat + * macro provided below*/ +pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pmix_status_t error, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* define a special macro to simplify sending of a heartbeat */ +#define PMIx_Heartbeat() \ + do { \ + pmix_info_t _in; \ + PMIX_INFO_CONSTRUCT(&_in); \ + PMIX_INFO_LOAD(&_in, PMIX_SEND_HEARTBEAT, NULL, PMIX_POINTER); \ + PMIx_Process_monitor_nb(&_in, PMIX_SUCCESS, NULL, 0, NULL, NULL); \ + PMIX_INFO_DESTRUCT(&_in); \ + } while(0) + #if defined(c_plusplus) || defined(__cplusplus) } #endif diff --git a/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h b/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h index 82c4ce59cc9..7bc9a8ce89a 100644 --- a/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h +++ b/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h @@ -123,6 +123,8 @@ typedef uint32_t pmix_rank_t; // a local system-level PMIx server #define PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first #define PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data +#define PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server + /* identification attributes */ #define PMIX_USERID "pmix.euid" // (uint32_t) effective user id @@ -218,8 +220,9 @@ typedef uint32_t pmix_rank_t; #define 
PMIX_COLLECTIVE_ALGO "pmix.calgo" // (char*) comma-delimited list of algorithms to use for collective #define PMIX_COLLECTIVE_ALGO_REQD "pmix.calreqd" // (bool) if true, indicates that the requested choice of algo is mandatory #define PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) notify parent process upon termination of child job -#define PMIX_RANGE "pmix.range" // (int) pmix_data_range_t value for calls to publish/lookup/unpublish -#define PMIX_PERSISTENCE "pmix.persist" // (int) pmix_persistence_t value for calls to publish +#define PMIX_RANGE "pmix.range" // (pmix_data_range_t) value for calls to publish/lookup/unpublish or for + // monitoring event notifications +#define PMIX_PERSISTENCE "pmix.persist" // (pmix_persistence_t) value for calls to publish #define PMIX_OPTIONAL "pmix.optional" // (bool) look only in the immediate data store for the requested value - do // not request data from the server if not found #define PMIX_EMBED_BARRIER "pmix.embed.barrier" // (bool) execute a blocking fence operation before executing the @@ -242,10 +245,17 @@ typedef uint32_t pmix_rank_t; #define PMIX_EVENT_HDLR_NAME "pmix.evname" // (char*) string name identifying this handler #define PMIX_EVENT_JOB_LEVEL "pmix.evjob" // (bool) register for job-specific events only #define PMIX_EVENT_ENVIRO_LEVEL "pmix.evenv" // (bool) register for environment events only -#define PMIX_EVENT_ORDER_PREPEND "pmix.evprepend" // (bool) prepend this handler to the precedence list -#define PMIX_EVENT_CUSTOM_RANGE "pmix.evrange" // (pmix_proc_t*) array of pmix_proc_t defining range of event notification +#define PMIX_EVENT_HDLR_FIRST "pmix.evfirst" // (bool) invoke this event handler before any other handlers +#define PMIX_EVENT_HDLR_LAST "pmix.evlast" // (bool) invoke this event handler after all other handlers have been called +#define PMIX_EVENT_HDLR_FIRST_IN_CATEGORY "pmix.evfirstcat" // (bool) invoke this event handler before any other handlers in this category +#define 
PMIX_EVENT_HDLR_LAST_IN_CATEGORY "pmix.evlastcat" // (bool) invoke this event handler after all other handlers in this category have been called +#define PMIX_EVENT_HDLR_BEFORE "pmix.evbefore" // (char*) put this event handler immediately before the one specified in the (char*) value +#define PMIX_EVENT_HDLR_AFTER "pmix.evafter" // (char*) put this event handler immediately after the one specified in the (char*) value +#define PMIX_EVENT_HDLR_PREPEND "pmix.evprepend" // (bool) prepend this handler to the precedence list within its category +#define PMIX_EVENT_HDLR_APPEND "pmix.evappend" // (bool) append this handler to the precedence list within its category +#define PMIX_EVENT_CUSTOM_RANGE "pmix.evrange" // (pmix_data_array_t*) array of pmix_proc_t defining range of event notification #define PMIX_EVENT_AFFECTED_PROC "pmix.evproc" // (pmix_proc_t) single proc that was affected -#define PMIX_EVENT_AFFECTED_PROCS "pmix.evaffected" // (pmix_proc_t*) array of pmix_proc_t defining affected procs +#define PMIX_EVENT_AFFECTED_PROCS "pmix.evaffected" // (pmix_data_array_t*) array of pmix_proc_t defining affected procs #define PMIX_EVENT_NON_DEFAULT "pmix.evnondef" // (bool) event is not to be delivered to default event handlers #define PMIX_EVENT_RETURN_OBJECT "pmix.evobject" // (void*) object to be returned whenever the registered cbfunc is invoked // NOTE: the object will _only_ be returned to the process that @@ -257,68 +267,78 @@ typedef uint32_t pmix_rank_t; #define PMIX_EVENT_TERMINATE_NODE "pmix.evterm.node" // (bool) RM intends to terminate all procs on this node #define PMIX_EVENT_TERMINATE_PROC "pmix.evterm.proc" // (bool) RM intends to terminate just this process #define PMIX_EVENT_ACTION_TIMEOUT "pmix.evtimeout" // (int) time in sec before RM will execute error response +#define PMIX_EVENT_NO_TERMINATION "pmix.evnoterm" // (bool) indicates that the handler has satisfactorily handled + // the event and believes termination of the application is not required 
+#define PMIX_EVENT_WANT_TERMINATION "pmix.evterm" // (bool) indicates that the handler has determined that the application should be terminated + /* attributes used to describe "spawn" attributes */ -#define PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use -#define PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs -#define PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs -#define PMIX_ADD_HOST "pmix.addhost" // (char*) comma-delimited list of hosts to add to allocation -#define PMIX_ADD_HOSTFILE "pmix.addhostfile" // (char*) hostfile to add to existing allocation -#define PMIX_PREFIX "pmix.prefix" // (char*) prefix to use for starting spawned procs -#define PMIX_WDIR "pmix.wdir" // (char*) working directory for spawned procs -#define PMIX_MAPPER "pmix.mapper" // (char*) mapper to use for placing spawned procs -#define PMIX_DISPLAY_MAP "pmix.dispmap" // (bool) display process map upon spawn -#define PMIX_PPR "pmix.ppr" // (char*) #procs to spawn on each identified resource -#define PMIX_MAPBY "pmix.mapby" // (char*) mapping policy -#define PMIX_RANKBY "pmix.rankby" // (char*) ranking policy -#define PMIX_BINDTO "pmix.bindto" // (char*) binding policy -#define PMIX_PRELOAD_BIN "pmix.preloadbin" // (bool) preload binaries -#define PMIX_PRELOAD_FILES "pmix.preloadfiles" // (char*) comma-delimited list of files to pre-position -#define PMIX_NON_PMI "pmix.nonpmi" // (bool) spawned procs will not call PMIx_Init -#define PMIX_STDIN_TGT "pmix.stdin" // (uint32_t) spawned proc rank that is to receive stdin -#define PMIX_FWD_STDIN "pmix.fwd.stdin" // (bool) forward my stdin to the designated proc -#define PMIX_FWD_STDOUT "pmix.fwd.stdout" // (bool) forward stdout from spawned procs to me -#define PMIX_FWD_STDERR "pmix.fwd.stderr" // (bool) forward stderr from spawned procs to me -#define PMIX_DEBUGGER_DAEMONS "pmix.debugger" // (bool) spawned app consists of debugger daemons -#define 
PMIX_COSPAWN_APP "pmix.cospawn" // (bool) designated app is to be spawned as a disconnected - // job - i.e., not part of the "comm_world" of the job +#define PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use +#define PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs +#define PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs +#define PMIX_ADD_HOST "pmix.addhost" // (char*) comma-delimited list of hosts to add to allocation +#define PMIX_ADD_HOSTFILE "pmix.addhostfile" // (char*) hostfile to add to existing allocation +#define PMIX_PREFIX "pmix.prefix" // (char*) prefix to use for starting spawned procs +#define PMIX_WDIR "pmix.wdir" // (char*) working directory for spawned procs +#define PMIX_MAPPER "pmix.mapper" // (char*) mapper to use for placing spawned procs +#define PMIX_DISPLAY_MAP "pmix.dispmap" // (bool) display process map upon spawn +#define PMIX_PPR "pmix.ppr" // (char*) #procs to spawn on each identified resource +#define PMIX_MAPBY "pmix.mapby" // (char*) mapping policy +#define PMIX_RANKBY "pmix.rankby" // (char*) ranking policy +#define PMIX_BINDTO "pmix.bindto" // (char*) binding policy +#define PMIX_PRELOAD_BIN "pmix.preloadbin" // (bool) preload binaries +#define PMIX_PRELOAD_FILES "pmix.preloadfiles" // (char*) comma-delimited list of files to pre-position +#define PMIX_NON_PMI "pmix.nonpmi" // (bool) spawned procs will not call PMIx_Init +#define PMIX_STDIN_TGT "pmix.stdin" // (uint32_t) spawned proc rank that is to receive stdin +#define PMIX_FWD_STDIN "pmix.fwd.stdin" // (bool) forward my stdin to the designated proc +#define PMIX_FWD_STDOUT "pmix.fwd.stdout" // (bool) forward stdout from spawned procs to me +#define PMIX_FWD_STDERR "pmix.fwd.stderr" // (bool) forward stderr from spawned procs to me +#define PMIX_DEBUGGER_DAEMONS "pmix.debugger" // (bool) spawned app consists of debugger daemons +#define PMIX_COSPAWN_APP "pmix.cospawn" // (bool) designated app is 
to be spawned as a disconnected + // job - i.e., not part of the "comm_world" of the job /* query attributes */ -#define PMIX_QUERY_NAMESPACES "pmix.qry.ns" // (char*) request a comma-delimited list of active nspaces -#define PMIX_QUERY_JOB_STATUS "pmix.qry.jst" // (pmix_status_t) status of a specified currently executing job -#define PMIX_QUERY_QUEUE_LIST "pmix.qry.qlst" // (char*) request a comma-delimited list of scheduler queues -#define PMIX_QUERY_QUEUE_STATUS "pmix.qry.qst" // (TBD) status of a specified scheduler queue -#define PMIX_QUERY_PROC_TABLE "pmix.qry.ptable" // (char*) input nspace of job whose info is being requested - // returns (pmix_data_array_t) an array of pmix_proc_info_t -#define PMIX_QUERY_LOCAL_PROC_TABLE "pmix.qry.lptable" // (char*) input nspace of job whose info is being requested - // returns (pmix_data_array_t) an array of pmix_proc_info_t for - // procs in job on same node -#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // return operations tool is authorized to perform -#define PMIX_QUERY_SPAWN_SUPPORT "pmix.qry.spawn" // return a comma-delimited list of supported spawn attributes -#define PMIX_QUERY_DEBUG_SUPPORT "pmix.qry.debug" // return a comma-delimited list of supported debug attributes -#define PMIX_QUERY_MEMORY_USAGE "pmix.qry.mem" // return info on memory usage for the procs indicated in the qualifiers -#define PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // constrain the query to local information only -#define PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // report average values -#define PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // report minimum and maximum value -#define PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) string identifier of the allocation whose status - // is being requested +#define PMIX_QUERY_NAMESPACES "pmix.qry.ns" // (char*) request a comma-delimited list of active nspaces +#define PMIX_QUERY_JOB_STATUS "pmix.qry.jst" // (pmix_status_t) status of a specified currently executing job +#define 
PMIX_QUERY_QUEUE_LIST "pmix.qry.qlst" // (char*) request a comma-delimited list of scheduler queues +#define PMIX_QUERY_QUEUE_STATUS "pmix.qry.qst" // (TBD) status of a specified scheduler queue +#define PMIX_QUERY_PROC_TABLE "pmix.qry.ptable" // (char*) input nspace of job whose info is being requested + // returns (pmix_data_array_t) an array of pmix_proc_info_t +#define PMIX_QUERY_LOCAL_PROC_TABLE "pmix.qry.lptable" // (char*) input nspace of job whose info is being requested + // returns (pmix_data_array_t) an array of pmix_proc_info_t for + // procs in job on same node +#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // (bool) return operations tool is authorized to perform +#define PMIX_QUERY_SPAWN_SUPPORT "pmix.qry.spawn" // (bool) return a comma-delimited list of supported spawn attributes +#define PMIX_QUERY_DEBUG_SUPPORT "pmix.qry.debug" // (bool) return a comma-delimited list of supported debug attributes +#define PMIX_QUERY_MEMORY_USAGE "pmix.qry.mem" // (bool) return info on memory usage for the procs indicated in the qualifiers +#define PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // (bool) constrain the query to local information only +#define PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // (bool) report average values +#define PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // (bool) report minimum and maximum value +#define PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) string identifier of the allocation whose status + // is being requested +#define PMIX_TIME_REMAINING "pmix.time.remaining" // (char*) query number of seconds (uint32_t) remaining in allocation + // for the specified nspace /* log attributes */ -#define PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr -#define PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout -#define PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless -#define PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere 
+#define PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr +#define PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout +#define PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless +#define PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere +#define PMIX_LOG_EMAIL "pmix.log.email" // (pmix_data_array_t) log via email based on pmix_info_t containing directives +#define PMIX_LOG_EMAIL_ADDR "pmix.log.emaddr" // (char*) comma-delimited list of email addresses that are to recv msg +#define PMIX_LOG_EMAIL_SUBJECT "pmix.log.emsub" // (char*) subject line for email +#define PMIX_LOG_EMAIL_MSG "pmix.log.emmsg" // (char*) msg to be included in email /* debugger attributes */ -#define PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start -#define PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init -#define PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification -#define PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are -#define PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release +#define PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start +#define PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init +#define PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification +#define PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are +#define PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release /* Resource Manager identification */ -#define 
/* job control attributes
 *
 * Each attribute is identified on the wire by its string key, so every
 * key below must be unique: two attributes sharing a key cannot be told
 * apart by the receiver. */
#define PMIX_JOB_CTRL_ID                    "pmix.jctrl.id"         // (char*) provide a string identifier for this request
#define PMIX_JOB_CTRL_PAUSE                 "pmix.jctrl.pause"      // (bool) pause the specified processes
#define PMIX_JOB_CTRL_RESUME                "pmix.jctrl.resume"     // (bool) "un-pause" the specified processes
#define PMIX_JOB_CTRL_CANCEL                "pmix.jctrl.cancel"     // (char*) cancel the specified request
                                                                    //         (NULL => cancel all requests from this requestor)
#define PMIX_JOB_CTRL_KILL                  "pmix.jctrl.kill"       // (bool) forcibly terminate the specified processes and cleanup
#define PMIX_JOB_CTRL_RESTART               "pmix.jctrl.restart"    // (char*) restart the specified processes using the given checkpoint ID
#define PMIX_JOB_CTRL_CHECKPOINT            "pmix.jctrl.ckpt"       // (char*) checkpoint the specified processes and assign the given ID to it
#define PMIX_JOB_CTRL_CHECKPOINT_EVENT      "pmix.jctrl.ckptev"     // (bool) use event notification to trigger process checkpoint
#define PMIX_JOB_CTRL_CHECKPOINT_SIGNAL     "pmix.jctrl.ckptsig"    // (int) use the given signal to trigger process checkpoint
/* NOTE: the timeout previously reused the "pmix.jctrl.ckptsig" key of
 * PMIX_JOB_CTRL_CHECKPOINT_SIGNAL, making the two directives
 * indistinguishable; it now has its own key. */
#define PMIX_JOB_CTRL_CHECKPOINT_TIMEOUT    "pmix.jctrl.ckpttime"   // (int) time in seconds to wait for checkpoint to complete
#define PMIX_JOB_CTRL_CHECKPOINT_METHOD     "pmix.jctrl.ckmethod"   // (pmix_data_array_t) array of pmix_info_t declaring each
                                                                    //      method and value supported by this application
#define PMIX_JOB_CTRL_SIGNAL                "pmix.jctrl.sig"        // (int) send given signal to specified processes
#define PMIX_JOB_CTRL_PROVISION             "pmix.jctrl.pvn"        // (char*) regex identifying nodes that are to be provisioned
#define PMIX_JOB_CTRL_PROVISION_IMAGE       "pmix.jctrl.pvnimg"     // (char*) name of the image that is to be provisioned
#define PMIX_JOB_CTRL_PREEMPTIBLE           "pmix.jctrl.preempt"    // (bool) job can be pre-empted
(char*) register to monitor file for signs of life +#define PMIX_MONITOR_FILE_SIZE "pmix.monitor.fsize" // (bool) monitor size of given file is growing to determine app is running +#define PMIX_MONITOR_FILE_ACCESS "pmix.monitor.faccess" // (char*) monitor time since last access of given file to determine app is running +#define PMIX_MONITOR_FILE_MODIFY "pmix.monitor.fmod" // (char*) monitor time since last modified of given file to determine app is running +#define PMIX_MONITOR_FILE_CHECK_TIME "pmix.monitor.ftime" // (uint32_t) time in seconds between checking file +#define PMIX_MONITOR_FILE_DROPS "pmix.monitor.fdrop" // (uint32_t) number of file checks that can be missed before + // generating the event + /**** PROCESS STATE DEFINITIONS ****/ typedef uint8_t pmix_proc_state_t; #define PMIX_PROC_STATE_UNDEF 0 /* undefined process state */ @@ -450,12 +507,21 @@ typedef int pmix_status_t; #define PMIX_ERR_V2X_BASE -100 /* v2.x communication errors */ -#define PMIX_ERR_LOST_CONNECTION_TO_SERVER (PMIX_ERR_V2X_BASE - 1) -#define PMIX_ERR_LOST_PEER_CONNECTION (PMIX_ERR_V2X_BASE - 2) -#define PMIX_ERR_LOST_CONNECTION_TO_CLIENT (PMIX_ERR_V2X_BASE - 3) +#define PMIX_ERR_LOST_CONNECTION_TO_SERVER (PMIX_ERR_V2X_BASE - 1) +#define PMIX_ERR_LOST_PEER_CONNECTION (PMIX_ERR_V2X_BASE - 2) +#define PMIX_ERR_LOST_CONNECTION_TO_CLIENT (PMIX_ERR_V2X_BASE - 3) /* used by the query system */ -#define PMIX_QUERY_PARTIAL_SUCCESS (PMIX_ERR_V2X_BASE - 4) -#define PMIX_NOTIFY_ALLOC_COMPLETE (PMIX_ERR_V2X_BASE - 5) +#define PMIX_QUERY_PARTIAL_SUCCESS (PMIX_ERR_V2X_BASE - 4) +/* request responses */ +#define PMIX_NOTIFY_ALLOC_COMPLETE (PMIX_ERR_V2X_BASE - 5) +/* job control */ +#define PMIX_JCTRL_CHECKPOINT (PMIX_ERR_V2X_BASE - 6) // monitored by client to trigger checkpoint operation +#define PMIX_JCTRL_CHECKPOINT_COMPLETE (PMIX_ERR_V2X_BASE - 7) // sent by client and monitored by server to notify that requested + // checkpoint operation has completed +#define PMIX_JCTRL_PREEMPT_ALERT 
(PMIX_ERR_V2X_BASE - 8) // monitored by client to detect RM intends to preempt +/* monitoring */ +#define PMIX_MONITOR_HEARTBEAT_ALERT (PMIX_ERR_V2X_BASE - 9) +#define PMIX_MONITOR_FILE_ALERT (PMIX_ERR_V2X_BASE - 10) /* define a starting point for operational error constants so * we avoid renumbering when making additions */ @@ -580,6 +646,7 @@ typedef uint8_t pmix_data_range_t; #define PMIX_RANGE_SESSION 4 // data available to all procs in session #define PMIX_RANGE_GLOBAL 5 // data available to all procs #define PMIX_RANGE_CUSTOM 6 // range is specified in a pmix_info_t +#define PMIX_RANGE_PROC_LOCAL 7 // restrict range to the local proc /* define a "persistence" policy for data published by clients */ typedef uint8_t pmix_persistence_t; diff --git a/opal/mca/pmix/pmix2x/pmix/include/pmix_server.h b/opal/mca/pmix/pmix2x/pmix/include/pmix_server.h index 531bc173d9d..55a66041e3d 100644 --- a/opal/mca/pmix/pmix2x/pmix/include/pmix_server.h +++ b/opal/mca/pmix/pmix2x/pmix/include/pmix_server.h @@ -328,6 +328,18 @@ typedef pmix_status_t (*pmix_server_alloc_fn_t)(const pmix_proc_t *client, const pmix_info_t data[], size_t ndata, pmix_info_cbfunc_t cbfunc, void *cbdata); +/* Execute a job control action on behalf of a client */ +typedef pmix_status_t (*pmix_server_job_control_fn_t)(const pmix_proc_t *requestor, + const pmix_proc_t targets[], size_t ntargets, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* Request that a client be monitored for activity */ +typedef pmix_status_t (*pmix_server_monitor_fn_t)(const pmix_proc_t *requestor, + const pmix_info_t *monitor, pmix_status_t error, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata); + typedef struct pmix_server_module_2_0_0_t { /* v1x interfaces */ pmix_server_client_connected_fn_t client_connected; @@ -350,12 +362,14 @@ typedef struct pmix_server_module_2_0_0_t { pmix_server_tool_connection_fn_t tool_connected; 
pmix_server_log_fn_t log; pmix_server_alloc_fn_t allocate; + pmix_server_job_control_fn_t job_control; + pmix_server_monitor_fn_t monitor; } pmix_server_module_t; /**** SERVER SUPPORT INIT/FINALIZE FUNCTIONS ****/ /* Initialize the server support library, and provide a - * pointer to a pmix_server_module_t structure + * pointer to a pmix_server_module_t structure * containing the caller's callback functions. The * array of pmix_info_t structs is used to pass * additional info that may be required by the server diff --git a/opal/mca/pmix/pmix2x/pmix/src/Makefile.am b/opal/mca/pmix/pmix2x/pmix/src/Makefile.am index 97ea7b3de9c..e70a8a39d58 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/Makefile.am +++ b/opal/mca/pmix/pmix2x/pmix/src/Makefile.am @@ -11,7 +11,7 @@ # All rights reserved. # Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved. -# Copyright (c) 2013-2016 Intel, Inc. All rights reserved +# Copyright (c) 2013-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -49,10 +49,22 @@ libpmix_la_LIBADD = \ libpmix_la_DEPENDENCIES = $(libpmix_la_LIBADD) if PMIX_EMBEDDED_MODE + +if WANT_INSTALL_HEADERS + +# retain output of pmix library +lib_LTLIBRARIES = libpmix.la +libpmix_la_SOURCES = $(headers) $(sources) +libpmix_la_LDFLAGS = -version-info $(libpmix_so_version) + +else + noinst_LTLIBRARIES = libpmix.la libpmix_la_SOURCES = $(headers) $(sources) libpmix_la_LDFLAGS = +endif + else lib_LTLIBRARIES = libpmix.la diff --git a/opal/mca/pmix/pmix2x/pmix/src/client/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/client/Makefile.include index 2f4fd6eeb19..e9abb45ff19 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/client/Makefile.include +++ b/opal/mca/pmix/pmix2x/pmix/src/client/Makefile.include @@ -1,6 +1,6 @@ # -*- makefile -*- # -# Copyright (c) 2014-2015 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2017 Intel, Inc. 
All rights reserved. # Copyright (c) 2014 Artem Y. Polyakov . # All rights reserved. # Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. @@ -22,7 +22,7 @@ sources += \ client/pmix_client_spawn.c \ client/pmix_client_connect.c -if !PMIX_EMBEDDED_MODE +if WANT_PMIX_BACKWARD sources += \ client/pmi1.c \ client/pmi2.c diff --git a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c index 034b4813c33..6abfb3fac89 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c +++ b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c @@ -633,8 +633,8 @@ static void _getnbfn(int fd, short flags, void *cbdata) rc = pmix_hash_fetch(&nptr->modex, pmix_globals.myid.rank, cb->key, &val); if( PMIX_SUCCESS != rc ){ rc = PMIX_ERR_NOT_FOUND; - goto respond; } + goto respond; } /* otherwise, the data must be something they "put" */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/common/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/common/Makefile.include index 4f29509b0f1..6a566f58a4b 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/common/Makefile.include +++ b/opal/mca/pmix/pmix2x/pmix/src/common/Makefile.include @@ -1,6 +1,6 @@ # -*- makefile -*- # -# Copyright (c) 2015 Intel, Inc. All rights reserved. +# Copyright (c) 2015-2017 Intel, Inc. All rights reserved. # Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. # $COPYRIGHT$ # @@ -13,4 +13,5 @@ sources += \ common/pmix_query.c \ common/pmix_strings.c \ common/pmix_log.c \ - common/pmix_jobdata.c + common/pmix_jobdata.c \ + common/pmix_control.c diff --git a/opal/mca/pmix/pmix2x/pmix/src/common/pmix_control.c b/opal/mca/pmix/pmix2x/pmix/src/common/pmix_control.c new file mode 100644 index 00000000000..b0f614b582b --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/common/pmix_control.c @@ -0,0 +1,276 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. 
+ * Copyright (c) 2016 Mellanox Technologies, Inc. + * All rights reserved. + * Copyright (c) 2016 IBM Corporation. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "src/util/argv.h" +#include "src/util/error.h" +#include "src/util/output.h" +#include "src/buffer_ops/buffer_ops.h" +#include "src/mca/ptl/ptl.h" + +#include "src/client/pmix_client_ops.h" +#include "src/server/pmix_server_ops.h" +#include "src/include/pmix_globals.h" + +static void relcbfunc(void *cbdata) +{ + pmix_shift_caddy_t *cd = (pmix_shift_caddy_t*)cbdata; + + pmix_output_verbose(2, pmix_globals.debug_output, + "pmix:query release callback"); + + if (NULL != cd->info) { + PMIX_INFO_FREE(cd->info, cd->ninfo); + } + PMIX_RELEASE(cd); +} +static void query_cbfunc(struct pmix_peer_t *peer, + pmix_ptl_hdr_t *hdr, + pmix_buffer_t *buf, void *cbdata) +{ + pmix_query_caddy_t *cd = (pmix_query_caddy_t*)cbdata; + pmix_status_t rc; + pmix_shift_caddy_t *results; + int cnt; + + pmix_output_verbose(2, pmix_globals.debug_output, + "pmix:query cback from server"); + + results = PMIX_NEW(pmix_shift_caddy_t); + + /* unpack the status */ + cnt = 1; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &results->status, &cnt, PMIX_STATUS))) { + PMIX_ERROR_LOG(rc); + goto complete; + } + if (PMIX_SUCCESS != results->status) { + goto complete; + } + + /* unpack any returned data */ + cnt = 1; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &results->ninfo, &cnt, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); + goto complete; + } + if (0 < results->ninfo) { + PMIX_INFO_CREATE(results->info, results->ninfo); + cnt = results->ninfo; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, results->info, &cnt, PMIX_INFO))) { + PMIX_ERROR_LOG(rc); + goto complete; + } + } + + complete: + pmix_output_verbose(2, pmix_globals.debug_output, + "pmix:query cback from server releasing"); + /* 
release the caller */ + if (NULL != cd->cbfunc) { + cd->cbfunc(results->status, results->info, results->ninfo, cd->cbdata, relcbfunc, results); + } + PMIX_RELEASE(cd); +} + +PMIX_EXPORT pmix_status_t PMIx_Job_control_nb(const pmix_proc_t targets[], size_t ntargets, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata) +{ + pmix_buffer_t *msg; + pmix_cmd_t cmd = PMIX_JOB_CONTROL_CMD; + pmix_status_t rc; + pmix_query_caddy_t *cb; + + pmix_output_verbose(2, pmix_globals.debug_output, + "pmix: job control called"); + + if (pmix_globals.init_cntr <= 0) { + return PMIX_ERR_INIT; + } + + /* if we are the server, then we just issue the request and + * return the response */ + if (PMIX_PROC_SERVER == pmix_globals.proc_type) { + if (NULL == pmix_host_server.job_control) { + /* nothing we can do */ + return PMIX_ERR_NOT_SUPPORTED; + } + pmix_output_verbose(2, pmix_globals.debug_output, + "pmix:job_control handed to RM"); + rc = pmix_host_server.job_control(&pmix_globals.myid, + targets, ntargets, + directives, ndirs, + cbfunc, cbdata); + return rc; + } + + /* if we are a client, then relay this request to the server */ + + /* if we aren't connected, don't attempt to send */ + if (!pmix_globals.connected) { + return PMIX_ERR_UNREACH; + } + + msg = PMIX_NEW(pmix_buffer_t); + /* pack the cmd */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &cmd, 1, PMIX_CMD))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + + /* pack the number of targets */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ntargets, 1, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + /* remember, the targets can be NULL to indicate that the operation + * is to be done against all members of our nspace */ + if (0 < ntargets) { + /* pack the targets */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, targets, ntargets, PMIX_PROC))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + } + + /* pack the directives */ + if 
(PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ndirs, 1, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + if (0 < ndirs) { + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, directives, ndirs, PMIX_INFO))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + } + + /* create a callback object as we need to pass it to the + * recv routine so we know which callback to use when + * the return message is recvd */ + cb = PMIX_NEW(pmix_query_caddy_t); + cb->cbfunc = cbfunc; + cb->cbdata = cbdata; + + /* push the message into our event base to send to the server */ + if (PMIX_SUCCESS != (rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, msg, query_cbfunc, (void*)cb))){ + PMIX_RELEASE(msg); + PMIX_RELEASE(cb); + } + + return rc; +} + +PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pmix_status_t error, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata) +{ + pmix_buffer_t *msg; + pmix_cmd_t cmd = PMIX_MONITOR_CMD; + pmix_status_t rc; + pmix_query_caddy_t *cb; + + pmix_output_verbose(2, pmix_globals.debug_output, + "pmix: monitor called"); + + if (pmix_globals.init_cntr <= 0) { + return PMIX_ERR_INIT; + } + + /* if we are the server, then we just issue the request and + * return the response */ + if (PMIX_PROC_SERVER == pmix_globals.proc_type) { + if (NULL == pmix_host_server.monitor) { + /* nothing we can do */ + return PMIX_ERR_NOT_SUPPORTED; + } + pmix_output_verbose(2, pmix_globals.debug_output, + "pmix:monitor handed to RM"); + rc = pmix_host_server.monitor(&pmix_globals.myid, monitor, error, + directives, ndirs, cbfunc, cbdata); + return rc; + } + + /* if we are a client, then relay this request to the server */ + + /* if we aren't connected, don't attempt to send */ + if (!pmix_globals.connected) { + return PMIX_ERR_UNREACH; + } + + msg = PMIX_NEW(pmix_buffer_t); + /* pack the cmd */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &cmd, 1, PMIX_CMD))) { + 
PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + + /* pack the monitor */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, monitor, 1, PMIX_INFO))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + + /* pack the error */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &error, 1, PMIX_STATUS))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + + /* pack the directives */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ndirs, 1, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + if (0 < ndirs) { + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, directives, ndirs, PMIX_INFO))) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(msg); + return rc; + } + } + + /* create a callback object as we need to pass it to the + * recv routine so we know which callback to use when + * the return message is recvd */ + cb = PMIX_NEW(pmix_query_caddy_t); + cb->cbfunc = cbfunc; + cb->cbdata = cbdata; + + /* push the message into our event base to send to the server */ + if (PMIX_SUCCESS != (rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, msg, query_cbfunc, (void*)cb))){ + PMIX_RELEASE(msg); + PMIX_RELEASE(cb); + } + + return rc; +} diff --git a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event.h b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event.h index f24078d6b22..e9ebd333181 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event.h +++ b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event.h @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2015-2016 Intel, Inc. All rights reserved + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -29,47 +29,42 @@ BEGIN_C_DECLS -/* define an object for tracking event handlers focused on a - * single status code */ -typedef struct { - pmix_list_item_t super; - char *name; - size_t index; - pmix_status_t code; - pmix_notification_fn_t evhdlr; - void *cbobject; -} pmix_single_event_t; -PMIX_CLASS_DECLARATION(pmix_single_event_t); +#define PMIX_EVENT_ORDER_NONE 0x00 +#define PMIX_EVENT_ORDER_FIRST 0x01 +#define PMIX_EVENT_ORDER_LAST 0x02 +#define PMIX_EVENT_ORDER_BEFORE 0x04 +#define PMIX_EVENT_ORDER_AFTER 0x08 +#define PMIX_EVENT_ORDER_PREPEND 0x10 +#define PMIX_EVENT_ORDER_APPEND 0x20 -/* define an object for tracking event handlers registered - * on multiple status codes, generally corresponding to a - * functional group */ +/* define a struct for tracking registration ranges */ typedef struct { - pmix_list_item_t super; - char *name; - size_t index; - pmix_status_t *codes; - size_t ncodes; - pmix_notification_fn_t evhdlr; - void *cbobject; -} pmix_multi_event_t; -PMIX_CLASS_DECLARATION(pmix_multi_event_t); + pmix_data_range_t range; + pmix_proc_t *procs; + size_t nprocs; +} pmix_range_trkr_t; -/* define an object for tracking default event handlers */ +/* define a common struct for tracking event handlers */ typedef struct { pmix_list_item_t super; char *name; size_t index; + uint8_t precedence; + char *locator; + pmix_range_trkr_t rng; pmix_notification_fn_t evhdlr; void *cbobject; -} pmix_default_event_t; -PMIX_CLASS_DECLARATION(pmix_default_event_t); + pmix_status_t *codes; + size_t ncodes; +} pmix_event_hdlr_t; +PMIX_CLASS_DECLARATION(pmix_event_hdlr_t); /* define an object for tracking status codes we are actively * registered to receive */ typedef struct { pmix_list_item_t super; pmix_status_t code; + size_t nregs; } pmix_active_code_t; PMIX_CLASS_DECLARATION(pmix_active_code_t); @@ -79,6 +74,8 @@ PMIX_CLASS_DECLARATION(pmix_active_code_t); typedef struct { pmix_object_t super; size_t nhdlrs; + 
pmix_event_hdlr_t *first; + pmix_event_hdlr_t *last; pmix_list_t actives; pmix_list_t single_events; pmix_list_t multi_events; @@ -98,15 +95,14 @@ typedef struct pmix_event_chain_t { pmix_object_t super; pmix_status_t status; bool nondefault; + bool endchain; pmix_proc_t source; pmix_data_range_t range; pmix_info_t *info; size_t ninfo; pmix_info_t *results; size_t nresults; - pmix_single_event_t *sing; - pmix_multi_event_t *multi; - pmix_default_event_t *def; + pmix_event_hdlr_t *evhdlr; pmix_op_cbfunc_t final_cbfunc; void *final_cbdata; } pmix_event_chain_t; diff --git a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c index 3eeb5a30b32..83474169fd0 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c +++ b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -29,6 +29,8 @@ static pmix_status_t notify_server_of_event(pmix_status_t status, pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); +static bool check_range(pmix_range_trkr_t *range, const pmix_proc_t *proc); + /* if we are a client, we call this function to notify the server of * an event. 
If we are a server, our host RM will call this function * to notify us of an event */ @@ -190,47 +192,49 @@ static void progress_local_event_hdlr(pmix_status_t status, void *notification_cbdata) { pmix_event_chain_t *chain = (pmix_event_chain_t*)notification_cbdata; - size_t n, nsave; + size_t n, nsave, cnt; pmix_info_t *newinfo; - pmix_list_item_t *nxt; - pmix_single_event_t *sing; - pmix_multi_event_t *multi; - pmix_default_event_t *def; + pmix_list_item_t *item; + pmix_event_hdlr_t *nxt; - /* if the caller indicates that the chain is completed, then stop here */ - if (PMIX_EVENT_ACTION_COMPLETE == status) { - goto complete; + /* aggregate the results per RFC0018 - first search the + * prior chained results to see if any keys have been NULL'd + * as this indicates that info struct should be removed */ + nsave = 0; + for (n=0; n < chain->nresults; n++) { + if (0 < strlen(chain->results[n].key)) { + ++nsave; + } } + /* we have to at least record the status returned by each + * stage of the event handler chain, so we have to reallocate + * the array to make space */ - /* save the current number of results */ - nsave = chain->nresults; + /* add in any new results plus space for the returned status */ + nsave += nresults + 1; /* create the new space */ - PMIX_INFO_CREATE(newinfo, chain->nresults + nresults + 1); + PMIX_INFO_CREATE(newinfo, nsave); /* transfer over the prior data */ + cnt = 0; for (n=0; n < chain->nresults; n++) { - PMIX_INFO_XFER(&newinfo[n], &chain->results[n]); - } - /* save this handler's response */ - if (NULL != chain->sing) { - if (NULL != chain->sing->name) { - (void)strncpy(newinfo[nsave].key, chain->sing->name, PMIX_MAX_KEYLEN); - } - } else if (NULL != chain->multi) { - if (NULL != chain->multi->name) { - (void)strncpy(newinfo[nsave].key, chain->multi->name, PMIX_MAX_KEYLEN); - } - } else if (NULL != chain->def) { - if (NULL != chain->def->name) { - (void)strncpy(newinfo[nsave].key, chain->def->name, PMIX_MAX_KEYLEN); + if (0 < 
strlen(chain->results[n].key)) { + PMIX_INFO_XFER(&newinfo[cnt], &chain->results[n]); + ++cnt; } + } + /* save this handler's returned status */ + if (NULL != chain->evhdlr->name) { + (void)strncpy(newinfo[cnt].key, chain->evhdlr->name, PMIX_MAX_KEYLEN); } else { - (void)strncpy(newinfo[nsave].key, "UNKNOWN", PMIX_MAX_KEYLEN); + (void)strncpy(newinfo[cnt].key, "UNKNOWN", PMIX_MAX_KEYLEN); } - newinfo[nsave].value.type = PMIX_STATUS; - newinfo[nsave].value.data.status = status; + newinfo[cnt].value.type = PMIX_STATUS; + newinfo[cnt].value.data.status = status; + ++cnt; /* transfer across the new results */ for (n=0; n < nresults; n++) { - PMIX_INFO_XFER(&newinfo[n+nsave+1], &results[n]); + PMIX_INFO_XFER(&newinfo[cnt], &results[n]); + ++cnt; } /* release the prior results */ if (0 < chain->nresults) { @@ -238,76 +242,139 @@ static void progress_local_event_hdlr(pmix_status_t status, } /* pass along the new ones */ chain->results = newinfo; - chain->nresults = nsave + nresults; + chain->nresults = cnt; + + /* if the caller indicates that the chain is completed, + * or we completed the "last" event, then stop here */ + if (PMIX_EVENT_ACTION_COMPLETE == status || chain->endchain) { + goto complete; + } + item = NULL; /* see if we need to continue, starting with the single code events */ - if (NULL != chain->sing) { + if (1 == chain->evhdlr->ncodes) { /* the last handler was for a single code - see if there are * any others that match this event */ - while (pmix_list_get_end(&pmix_globals.events.single_events) != (nxt = pmix_list_get_next(&chain->sing->super))) { - sing = (pmix_single_event_t*)nxt; - if (sing->code == chain->status) { - chain->sing = sing; + item = &chain->evhdlr->super; + while (pmix_list_get_end(&pmix_globals.events.single_events) != (item = pmix_list_get_next(item))) { + nxt = (pmix_event_hdlr_t*)item; + if (nxt->codes[0] == chain->status && + check_range(&nxt->rng, &chain->source)) { + chain->evhdlr = nxt; /* add any cbobject - the info struct for 
it is at the end */ - chain->info[chain->ninfo-1].value.data.ptr = sing->cbobject; - sing->evhdlr(sing->index, - chain->status, &chain->source, - chain->info, chain->ninfo, - chain->results, chain->nresults, - progress_local_event_hdlr, (void*)chain); - goto complete; + chain->info[chain->ninfo-1].value.data.ptr = nxt->cbobject; + nxt->evhdlr(nxt->index, + chain->status, &chain->source, + chain->info, chain->ninfo, + chain->results, chain->nresults, + progress_local_event_hdlr, (void*)chain); + return; } } /* if we get here, then there are no more single code * events that match */ - chain->sing = NULL; - /* pickup the beginning of the multi-code event list */ - chain->multi = (pmix_multi_event_t*)pmix_list_get_begin(&pmix_globals.events.multi_events); + item = pmix_list_get_begin(&pmix_globals.events.multi_events); } /* see if we need to continue with the multi code events */ - if (NULL != chain->multi) { - while (pmix_list_get_end(&pmix_globals.events.multi_events) != (nxt = pmix_list_get_next(&chain->multi->super))) { - multi = (pmix_multi_event_t*)nxt; - for (n=0; n < multi->ncodes; n++) { - if (multi->codes[n] == chain->status) { - /* found it - invoke the handler, pointing its - * callback function to our progression function */ - chain->multi = multi; + if (NULL != chain->evhdlr->codes || NULL != item) { + /* the last handler was for a multi-code event, or we exhausted + * all the single code events */ + if (NULL == item) { + /* if the last handler was multi-code, then start from that point */ + item = &chain->evhdlr->super; + } + while (pmix_list_get_end(&pmix_globals.events.multi_events) != (item = pmix_list_get_next(item))) { + nxt = (pmix_event_hdlr_t*)item; + if (!check_range(&nxt->rng, &chain->source)) { + continue; + } + for (n=0; n < nxt->ncodes; n++) { + /* if this event handler provided a range, check to see if + * the source fits within it */ + if (nxt->codes[n] == chain->status) { + chain->evhdlr = nxt; /* add any cbobject - the info struct for 
it is at the end */ - chain->info[chain->ninfo-1].value.data.ptr = multi->cbobject; - multi->evhdlr(multi->index, - chain->status, &chain->source, - chain->info, chain->ninfo, - chain->results, chain->nresults, - progress_local_event_hdlr, (void*)chain); - goto complete; + chain->info[chain->ninfo-1].value.data.ptr = nxt->cbobject; + nxt->evhdlr(nxt->index, + chain->status, &chain->source, + chain->info, chain->ninfo, + chain->results, chain->nresults, + progress_local_event_hdlr, (void*)chain); + return; } } } /* if we get here, then there are no more multi-mode * events that match */ - chain->multi = NULL; - /* pickup the beginning of the default event list */ - chain->def = (pmix_default_event_t*)pmix_list_get_begin(&pmix_globals.events.default_events); + item = pmix_list_get_begin(&pmix_globals.events.default_events); } - /* if they didn't want it to go to a default handler, then we are done */ - if (chain->nondefault) { - goto complete; + /* if they didn't want it to go to a default handler, then ignore them */ + if (!chain->nondefault) { + if (NULL == item) { + item = &chain->evhdlr->super; + } + if (pmix_list_get_end(&pmix_globals.events.default_events) != (item = pmix_list_get_next(item))) { + nxt = (pmix_event_hdlr_t*)item; + /* if this event handler provided a range, check to see if + * the source fits within it */ + if (check_range(&nxt->rng, &chain->source)) { + chain->evhdlr = nxt; + /* add any cbobject - the info struct for it is at the end */ + chain->info[chain->ninfo-1].value.data.ptr = nxt->cbobject; + nxt->evhdlr(nxt->index, + chain->status, &chain->source, + chain->info, chain->ninfo, + chain->results, chain->nresults, + progress_local_event_hdlr, (void*)chain); + return; + } + } } - if (NULL != chain->def) { - if (pmix_list_get_end(&pmix_globals.events.default_events) != (nxt = pmix_list_get_next(&chain->def->super))) { - def = (pmix_default_event_t*)nxt; - chain->def = def; + /* if we registered a "last" handler, and it fits the given range + 
* and code, then invoke it now */ + if (NULL != pmix_globals.events.last && + check_range(&pmix_globals.events.last->rng, &chain->source)) { + chain->endchain = true; // ensure we don't do this again + if (1 == pmix_globals.events.last->ncodes && + pmix_globals.events.last->codes[0] == chain->status) { + chain->evhdlr = pmix_globals.events.last; /* add any cbobject - the info struct for it is at the end */ - chain->info[chain->ninfo-1].value.data.ptr = def->cbobject; - def->evhdlr(def->index, - chain->status, &chain->source, - chain->info, chain->ninfo, - chain->results, chain->nresults, - progress_local_event_hdlr, (void*)chain); + chain->info[chain->ninfo-1].value.data.ptr = pmix_globals.events.last->cbobject; + chain->evhdlr->evhdlr(chain->evhdlr->index, + chain->status, &chain->source, + chain->info, chain->ninfo, + chain->results, chain->nresults, + progress_local_event_hdlr, (void*)chain); + return; + } else if (NULL != pmix_globals.events.last->codes) { + /* need to check if this code is included in the array */ + for (n=0; n < pmix_globals.events.last->ncodes; n++) { + if (pmix_globals.events.last->codes[n] == chain->status) { + chain->evhdlr = pmix_globals.events.last; + /* add any cbobject - the info struct for it is at the end */ + chain->info[chain->ninfo-1].value.data.ptr = pmix_globals.events.last->cbobject; + chain->evhdlr->evhdlr(chain->evhdlr->index, + chain->status, &chain->source, + chain->info, chain->ninfo, + chain->results, chain->nresults, + progress_local_event_hdlr, (void*)chain); + return; + } + } + } else { + /* gets run for all codes */ + chain->evhdlr = pmix_globals.events.last; + /* add any cbobject - the info struct for it is at the end */ + chain->info[chain->ninfo-1].value.data.ptr = pmix_globals.events.last->cbobject; + chain->evhdlr->evhdlr(chain->evhdlr->index, + chain->status, &chain->source, + chain->info, chain->ninfo, + chain->results, chain->nresults, + progress_local_event_hdlr, (void*)chain); + return; } } @@ -339,10 
+406,9 @@ void pmix_invoke_local_event_hdlr(pmix_event_chain_t *chain) /* We need to parse thru each registered handler and determine * which one(s) to call for the specific error */ size_t i; - pmix_single_event_t *sing; - pmix_multi_event_t *multi; - pmix_default_event_t *def; + pmix_event_hdlr_t *evhdlr; pmix_status_t rc = PMIX_SUCCESS; + bool found; pmix_output_verbose(2, pmix_globals.debug_output, "%s:%d invoke_local_event_hdlr", @@ -363,45 +429,63 @@ void pmix_invoke_local_event_hdlr(pmix_event_chain_t *chain) } } + /* if we registered a "first" handler, and it fits the given range, + * then invoke it first */ + if (NULL != pmix_globals.events.first) { + if (1 == pmix_globals.events.first->ncodes && + pmix_globals.events.first->codes[0] == chain->status && + check_range(&pmix_globals.events.first->rng, &chain->source)) { + /* invoke the handler */ + chain->evhdlr = pmix_globals.events.first; + goto invk; + } else if (NULL != pmix_globals.events.first->codes) { + /* need to check if this code is included in the array */ + found = false; + for (i=0; i < pmix_globals.events.first->ncodes; i++) { + if (pmix_globals.events.first->codes[i] == chain->status) { + found = true; + break; + } + } + /* if this event handler provided a range, check to see if + * the source fits within it */ + if (found && check_range(&pmix_globals.events.first->rng, &chain->source)) { + /* invoke the handler */ + chain->evhdlr = pmix_globals.events.first; + goto invk; + } + } else { + /* take all codes for a default handler */ + if (check_range(&pmix_globals.events.first->rng, &chain->source)) { + /* invoke the handler */ + chain->evhdlr = pmix_globals.events.first; + goto invk; + } + } + /* get here if there is no match, so fall thru */ + } + /* cycle thru the single-event registrations first */ - PMIX_LIST_FOREACH(sing, &pmix_globals.events.single_events, pmix_single_event_t) { - if (sing->code == chain->status) { - /* found it - invoke the handler, pointing its - * callback function to 
our progression function */ - chain->sing = sing; - /* add any cbobject - the info struct for it is at the end */ - chain->info[chain->ninfo-1].value.data.ptr = sing->cbobject; - pmix_output_verbose(2, pmix_globals.debug_output, - "[%s:%d] CALLING SINGLE EVHDLR", - pmix_globals.myid.nspace, pmix_globals.myid.rank); - sing->evhdlr(sing->index, - chain->status, &chain->source, - chain->info, chain->ninfo, - NULL, 0, - progress_local_event_hdlr, (void*)chain); - return; + PMIX_LIST_FOREACH(evhdlr, &pmix_globals.events.single_events, pmix_event_hdlr_t) { + if (evhdlr->codes[0] == chain->status) { + if (check_range(&evhdlr->rng, &chain->source)) { + /* invoke the handler */ + chain->evhdlr = evhdlr; + goto invk; + } } } /* if we didn't find any match in the single-event registrations, * then cycle thru the multi-event registrations next */ - PMIX_LIST_FOREACH(multi, &pmix_globals.events.multi_events, pmix_multi_event_t) { - for (i=0; i < multi->ncodes; i++) { - if (multi->codes[i] == chain->status) { - /* found it - invoke the handler, pointing its - * callback function to our progression function */ - chain->multi = multi; - /* add any cbobject - the info struct for it is at the end */ - chain->info[chain->ninfo-1].value.data.ptr = multi->cbobject; - pmix_output_verbose(2, pmix_globals.debug_output, - "[%s:%d] CALLING MULTI EVHDLR", - pmix_globals.myid.nspace, pmix_globals.myid.rank); - multi->evhdlr(multi->index, - chain->status, &chain->source, - chain->info, chain->ninfo, - NULL, 0, - progress_local_event_hdlr, (void*)chain); - return; + PMIX_LIST_FOREACH(evhdlr, &pmix_globals.events.multi_events, pmix_event_hdlr_t) { + for (i=0; i < evhdlr->ncodes; i++) { + if (evhdlr->codes[i] == chain->status) { + if (check_range(&evhdlr->rng, &chain->source)) { + /* invoke the handler */ + chain->evhdlr = evhdlr; + goto invk; + } } } } @@ -412,26 +496,33 @@ void pmix_invoke_local_event_hdlr(pmix_event_chain_t *chain) } /* finally, pass it to any default handlers */ - 
PMIX_LIST_FOREACH(def, &pmix_globals.events.default_events, pmix_default_event_t) { - chain->def = def; - /* add any cbobject - the info struct for it is at the end */ - chain->info[chain->ninfo-1].value.data.ptr = def->cbobject; - pmix_output_verbose(2, pmix_globals.debug_output, - "[%s:%d] CALLING DEFAULT EVHDLR", __FILE__, __LINE__); - def->evhdlr(def->index, - chain->status, &chain->source, - chain->info, chain->ninfo, - NULL, 0, - progress_local_event_hdlr, (void*)chain); - return; + PMIX_LIST_FOREACH(evhdlr, &pmix_globals.events.default_events, pmix_event_hdlr_t) { + if (check_range(&evhdlr->rng, &chain->source)) { + /* invoke the handler */ + chain->evhdlr = evhdlr; + goto invk; + } } - + /* if we got here, then nothing was found */ complete: /* we still have to call their final callback */ if (NULL != chain->final_cbfunc) { chain->final_cbfunc(rc, chain->final_cbdata); } return; + + + invk: + /* invoke the handler */ + chain->info[chain->ninfo-1].value.data.ptr = chain->evhdlr->cbobject; + pmix_output_verbose(2, pmix_globals.debug_output, + "[%s:%d] INVOKING EVHDLR", __FILE__, __LINE__); + chain->evhdlr->evhdlr(chain->evhdlr->index, + chain->status, &chain->source, + chain->info, chain->ninfo, + NULL, 0, + progress_local_event_hdlr, (void*)chain); + return; } @@ -617,66 +708,104 @@ pmix_status_t pmix_server_notify_client_of_event(pmix_status_t status, return PMIX_SUCCESS; } -static void sevcon(pmix_single_event_t *p) -{ - p->name = NULL; - p->evhdlr = NULL; - p->cbobject = NULL; -} -static void sevdes(pmix_single_event_t *p) +static bool check_range(pmix_range_trkr_t *rng, + const pmix_proc_t *proc) { - if (NULL != p->name) { - free(p->name); + size_t n; + + if (PMIX_RANGE_UNDEF == rng->range || + PMIX_RANGE_GLOBAL == rng->range || + PMIX_RANGE_SESSION == rng->range || + PMIX_RANGE_LOCAL == rng->range) { // assume RM took care of session & local for now + return true; } + if (PMIX_RANGE_NAMESPACE == rng->range) { + if (0 == strncmp(pmix_globals.myid.nspace, 
proc->nspace, PMIX_MAX_NSLEN)) { + return true; + } + return false; + } + if (PMIX_RANGE_PROC_LOCAL == rng->range) { + if (0 == strncmp(pmix_globals.myid.nspace, proc->nspace, PMIX_MAX_NSLEN) && + pmix_globals.myid.rank == proc->rank) { + return true; + } + return false; + } + if (PMIX_RANGE_CUSTOM == rng->range) { + if (NULL != rng->procs) { + /* see if this proc was included */ + for (n=0; n < rng->nprocs; n++) { + if (0 != strncmp(rng->procs[n].nspace, proc->nspace, PMIX_MAX_NSLEN)) { + continue; + } + if (PMIX_RANK_WILDCARD == rng->procs[n].rank || + rng->procs[n].rank == proc->rank) { + return true; + } + } + /* if we get here, then this proc isn't in range */ + return false; + } else { + /* if they didn't give us a list, then assume + * everyone included */ + return true; + } + } + + /* if it is anything else, then reject it */ + return false; } -PMIX_CLASS_INSTANCE(pmix_single_event_t, - pmix_list_item_t, - sevcon, sevdes); -static void mevcon(pmix_multi_event_t *p) + +/**** CLASS INSTANTIATIONS ****/ + +static void sevcon(pmix_event_hdlr_t *p) { p->name = NULL; - p->codes = NULL; - p->ncodes = 0; + p->index = UINT_MAX; + p->precedence = PMIX_EVENT_ORDER_NONE; + p->locator = NULL; + p->rng.range = PMIX_RANGE_UNDEF; + p->rng.procs = NULL; + p->rng.nprocs = 0; p->evhdlr = NULL; p->cbobject = NULL; + p->codes = NULL; + p->ncodes = 0; } -static void mevdes(pmix_multi_event_t *p) +static void sevdes(pmix_event_hdlr_t *p) { if (NULL != p->name) { free(p->name); } + if (NULL != p->locator) { + free(p->locator); + } + if (NULL != p->rng.procs) { + free(p->rng.procs); + } if (NULL != p->codes) { free(p->codes); } } -PMIX_CLASS_INSTANCE(pmix_multi_event_t, +PMIX_CLASS_INSTANCE(pmix_event_hdlr_t, pmix_list_item_t, - mevcon, mevdes); + sevcon, sevdes); -static void devcon(pmix_default_event_t *p) +static void accon(pmix_active_code_t *p) { - p->name = NULL; - p->evhdlr = NULL; - p->cbobject = NULL; + p->nregs = 0; } -static void devdes(pmix_default_event_t *p) -{ - if 
(NULL != p->name) { - free(p->name); - } -} -PMIX_CLASS_INSTANCE(pmix_default_event_t, - pmix_list_item_t, - devcon, devdes); - PMIX_CLASS_INSTANCE(pmix_active_code_t, pmix_list_item_t, - NULL, NULL); + accon, NULL); static void evcon(pmix_events_t *p) { p->nhdlrs = 0; + p->first = NULL; + p->last = NULL; PMIX_CONSTRUCT(&p->actives, pmix_list_t); PMIX_CONSTRUCT(&p->single_events, pmix_list_t); PMIX_CONSTRUCT(&p->multi_events, pmix_list_t); @@ -684,6 +813,12 @@ static void evcon(pmix_events_t *p) } static void evdes(pmix_events_t *p) { + if (NULL != p->first) { + PMIX_RELEASE(p->first); + } + if (NULL != p->last) { + PMIX_RELEASE(p->last); + } PMIX_LIST_DESTRUCT(&p->actives); PMIX_LIST_DESTRUCT(&p->single_events); PMIX_LIST_DESTRUCT(&p->multi_events); @@ -698,14 +833,13 @@ static void chcon(pmix_event_chain_t *p) memset(p->source.nspace, 0, PMIX_MAX_NSLEN+1); p->source.rank = PMIX_RANK_UNDEF; p->nondefault = false; + p->endchain = false; p->range = PMIX_RANGE_UNDEF; p->info = NULL; p->ninfo = 0; p->results = NULL; p->nresults = 0; - p->sing = NULL; - p->multi = NULL; - p->def = NULL; + p->evhdlr = NULL; p->final_cbfunc = NULL; p->final_cbdata = NULL; } diff --git a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_registration.c b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_registration.c index 5b932942d50..134bece6ea4 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_registration.c +++ b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_registration.c @@ -22,39 +22,46 @@ #include "src/client/pmix_client_ops.h" #include "src/server/pmix_server_ops.h" #include "src/include/pmix_globals.h" +#include "src/event/pmix_event.h" typedef struct { pmix_object_t super; + volatile bool active; + pmix_event_t ev; size_t index; + bool firstoverall; + bool enviro; pmix_list_t *list; - pmix_list_item_t *item; - pmix_shift_caddy_t *cd; + pmix_event_hdlr_t *hdlr; + void *cd; pmix_status_t *codes; size_t ncodes; pmix_info_t *info; size_t ninfo; + pmix_notification_fn_t evhdlr; 
+ pmix_evhdlr_reg_cbfunc_t evregcbfn; + void *cbdata; } pmix_rshift_caddy_t; static void rscon(pmix_rshift_caddy_t *p) { + p->firstoverall = false; + p->enviro = false; p->list = NULL; - p->item = NULL; + p->hdlr = NULL; p->cd = NULL; p->codes = NULL; p->ncodes = 0; p->info = NULL; p->ninfo = 0; + p->evhdlr = NULL; + p->evregcbfn = NULL; + p->cbdata = NULL; } static void rsdes(pmix_rshift_caddy_t *p) { if (NULL != p->cd) { PMIX_RELEASE(p->cd); } - if (NULL != p->codes) { - free(p->codes); - } - if (NULL != p->info) { - PMIX_INFO_FREE(p->info, p->ninfo); - } } PMIX_CLASS_INSTANCE(pmix_rshift_caddy_t, pmix_object_t, @@ -65,6 +72,7 @@ static void regevents_cbfunc(struct pmix_peer_t *peer, pmix_ptl_hdr_t *hdr, pmix_buffer_t *buf, void *cbdata) { pmix_rshift_caddy_t *rb = (pmix_rshift_caddy_t*)cbdata; + pmix_rshift_caddy_t *cd = (pmix_rshift_caddy_t*)rb->cd; pmix_status_t rc, ret; int cnt; size_t index = rb->index; @@ -78,17 +86,34 @@ static void regevents_cbfunc(struct pmix_peer_t *peer, pmix_ptl_hdr_t *hdr, (PMIX_SUCCESS != ret)) { PMIX_ERROR_LOG(rc); /* remove the err handler and call the error handler reg completion callback fn.*/ - if (NULL != rb->list && NULL != rb->item) { - pmix_list_remove_item(rb->list, rb->item); - PMIX_RELEASE(rb->item); + if (NULL == rb->list) { + if (NULL != rb->hdlr) { + PMIX_RELEASE(rb->hdlr); + } + if (rb->firstoverall) { + pmix_globals.events.first = NULL; + } else { + pmix_globals.events.last = NULL; + } + } else if (NULL != rb->hdlr) { + pmix_list_remove_item(rb->list, &rb->hdlr->super); + PMIX_RELEASE(rb->hdlr); } ret = PMIX_ERR_SERVER_FAILED_REQUEST; index = UINT_MAX; } /* call the callback */ - if (NULL != rb->cd && NULL != rb->cd->cbfunc.evregcbfn) { - rb->cd->cbfunc.evregcbfn(ret, index, rb->cd->cbdata); + if (NULL != cd && NULL != cd->evregcbfn) { + cd->evregcbfn(ret, index, cd->cbdata); + } + /* release any info we brought along as they are + * internally generated and not provided by the caller */ + if (NULL!= rb->info) { + 
PMIX_INFO_FREE(rb->info, rb->ninfo); + } + if (NULL != rb->codes) { + free(rb->codes); } PMIX_RELEASE(rb); } @@ -96,29 +121,47 @@ static void regevents_cbfunc(struct pmix_peer_t *peer, pmix_ptl_hdr_t *hdr, static void reg_cbfunc(pmix_status_t status, void *cbdata) { pmix_rshift_caddy_t *rb = (pmix_rshift_caddy_t*)cbdata; + pmix_rshift_caddy_t *cd = (pmix_rshift_caddy_t*)rb->cd; pmix_status_t rc = status; size_t index = rb->index; if (PMIX_SUCCESS != status) { /* if we failed to register, then remove this event */ - if (NULL != rb->list && NULL != rb->item) { - pmix_list_remove_item(rb->list, rb->item); - PMIX_RELEASE(rb->item); - rc = PMIX_ERR_SERVER_FAILED_REQUEST; - index = UINT_MAX; + if (NULL == rb->list) { + if (NULL != rb->hdlr) { + PMIX_RELEASE(rb->hdlr); + } + if (rb->firstoverall) { + pmix_globals.events.first = NULL; + } else { + pmix_globals.events.last = NULL; + } + } else if (NULL != rb->hdlr) { + pmix_list_remove_item(rb->list, &rb->hdlr->super); + PMIX_RELEASE(rb->hdlr); } + rc = PMIX_ERR_SERVER_FAILED_REQUEST; + index = UINT_MAX; } - if (NULL != rb->cd && NULL != rb->cd->cbfunc.evregcbfn) { + if (NULL != cd && NULL != cd->evregcbfn) { /* pass back our local index */ - rb->cd->cbfunc.evregcbfn(rc, index, rb->cd->cbdata); + cd->evregcbfn(rc, index, cd->cbdata); + } + /* release any info we brought along as they are + * internally generated and not provided by the caller */ + if (NULL!= rb->info) { + PMIX_INFO_FREE(rb->info, rb->ninfo); + } + if (NULL != rb->codes) { + free(rb->codes); } - PMIX_RELEASE(rb); } static pmix_status_t _send_to_server(pmix_rshift_caddy_t *rcd) { + pmix_rshift_caddy_t *cd = (pmix_rshift_caddy_t*)rcd->cd; pmix_status_t rc; pmix_buffer_t *msg; pmix_cmd_t cmd=PMIX_REGEVENTS_CMD; @@ -130,13 +173,13 @@ static pmix_status_t _send_to_server(pmix_rshift_caddy_t *rcd) return rc; } /* pack the number of codes */ - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &rcd->cd->ncodes, 1, PMIX_SIZE))) { + if (PMIX_SUCCESS != (rc = 
pmix_bfrop.pack(msg, &cd->ncodes, 1, PMIX_SIZE))) { PMIX_ERROR_LOG(rc); return rc; } /* pack any provided codes - may be NULL */ - if (NULL != rcd->cd->codes && 0 < rcd->cd->ncodes) { - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, rcd->cd->codes, rcd->cd->ncodes, PMIX_STATUS))) { + if (NULL != cd->codes && 0 < cd->ncodes) { + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, cd->codes, cd->ncodes, PMIX_STATUS))) { PMIX_ERROR_LOG(rc); return rc; } @@ -163,9 +206,7 @@ static pmix_status_t _send_to_server(pmix_rshift_caddy_t *rcd) return rc; } -static pmix_status_t _add_hdlr(pmix_list_t *list, pmix_list_item_t *item, - size_t index, bool prepend, pmix_list_t *xfer, - pmix_shift_caddy_t *cd) +static pmix_status_t _add_hdlr(pmix_rshift_caddy_t *cd, pmix_list_t *xfer) { pmix_rshift_caddy_t *cd2; pmix_info_caddy_t *ixfer; @@ -177,12 +218,6 @@ static pmix_status_t _add_hdlr(pmix_list_t *list, pmix_list_item_t *item, pmix_output_verbose(2, pmix_globals.debug_output, "pmix: _add_hdlr"); - if (prepend) { - pmix_list_prepend(list, item); - } else { - pmix_list_append(list, item); - } - /* check to see if we have an active registration on these codes */ if (NULL == cd->codes) { registered = false; @@ -190,15 +225,15 @@ static pmix_status_t _add_hdlr(pmix_list_t *list, pmix_list_item_t *item, if (PMIX_MAX_ERR_CONSTANT == active->code) { /* we have registered a default */ registered = true; + ++active->nregs; break; } } if (!registered) { active = PMIX_NEW(pmix_active_code_t); active->code = PMIX_MAX_ERR_CONSTANT; + active->nregs = 1; pmix_list_append(&pmix_globals.events.actives, &active->super); - /* ensure we register it */ - need_register = true; } } else { for (n=0; n < cd->ncodes; n++) { @@ -206,12 +241,14 @@ static pmix_status_t _add_hdlr(pmix_list_t *list, pmix_list_item_t *item, PMIX_LIST_FOREACH(active, &pmix_globals.events.actives, pmix_active_code_t) { if (active->code == cd->codes[n]) { registered = true; + ++active->nregs; break; } } if (!registered) { active = 
PMIX_NEW(pmix_active_code_t); active->code = cd->codes[n]; + active->nregs = 1; pmix_list_append(&pmix_globals.events.actives, &active->super); /* ensure we register it */ need_register = true; @@ -221,9 +258,10 @@ static pmix_status_t _add_hdlr(pmix_list_t *list, pmix_list_item_t *item, /* prep next step */ cd2 = PMIX_NEW(pmix_rshift_caddy_t); - cd2->index = index; - cd2->list = list; - cd2->item = item; + cd2->index = cd->index; + cd2->firstoverall = cd->firstoverall; + cd2->list = cd->list; + cd2->hdlr = cd->hdlr; PMIX_RETAIN(cd); cd2->cd = cd; cd2->ninfo = pmix_list_get_size(xfer); @@ -249,9 +287,10 @@ static pmix_status_t _add_hdlr(pmix_list_t *list, pmix_list_item_t *item, if (PMIX_SUCCESS != (rc = _send_to_server(cd2))) { pmix_output_verbose(2, pmix_globals.debug_output, "pmix: add_hdlr - pack send_to_server failed status=%d", rc); + if (NULL != cd2->info) { + PMIX_INFO_FREE(cd2->info, cd2->ninfo); + } PMIX_RELEASE(cd2); - pmix_list_remove_item(list, item); - PMIX_RELEASE(item); return rc; } return PMIX_ERR_WOULD_BLOCK; @@ -267,13 +306,17 @@ static pmix_status_t _add_hdlr(pmix_list_t *list, pmix_list_item_t *item, if (PMIX_SUCCESS != (rc = pmix_host_server.register_events(cd->codes, cd->ncodes, cd2->info, cd2->ninfo, reg_cbfunc, cd2))) { + if (NULL != cd2->info) { + PMIX_INFO_FREE(cd2->info, cd2->ninfo); + } PMIX_RELEASE(cd2); - pmix_list_remove_item(list, item); - PMIX_RELEASE(item); return rc; } return PMIX_ERR_WOULD_BLOCK; } else { + if (NULL != cd2->info) { + PMIX_INFO_FREE(cd2->info, cd2->ninfo); + } PMIX_RELEASE(cd2); } @@ -284,15 +327,18 @@ static void reg_event_hdlr(int sd, short args, void *cbdata) { size_t index = 0, n; pmix_status_t rc; - pmix_shift_caddy_t *cd = (pmix_shift_caddy_t*)cbdata; - pmix_single_event_t *sing; - pmix_multi_event_t *multi; - pmix_default_event_t *def; - bool prepend = false; - char *name = NULL; + pmix_rshift_caddy_t *cd = (pmix_rshift_caddy_t*)cbdata; + pmix_event_hdlr_t *evhdlr, *ev; + uint8_t location = 
PMIX_EVENT_ORDER_NONE; + char *name = NULL, *locator = NULL; + bool firstoverall=false, lastoverall=false; + bool found; pmix_list_t xfer; pmix_info_caddy_t *ixfer; void *cbobject = NULL; + pmix_data_range_t range = PMIX_RANGE_UNDEF; + pmix_proc_t *parray = NULL; + size_t nprocs; pmix_output_verbose(2, pmix_globals.debug_output, "pmix: register event_hdlr with %d infos", (int)cd->ninfo); @@ -302,16 +348,60 @@ static void reg_event_hdlr(int sd, short args, void *cbdata) /* if directives were included */ if (NULL != cd->info) { for (n=0; n < cd->ninfo; n++) { - if (0 == strcmp(cd->info[n].key, PMIX_EVENT_ORDER_PREPEND)) { - /* flag if they asked to prepend this event - * on the precedence order */ - prepend = true; - } else if (0 == strcmp(cd->info[n].key, PMIX_EVENT_HDLR_NAME)) { + if (0 == strncmp(cd->info[n].key, PMIX_EVENT_HDLR_FIRST, PMIX_MAX_KEYLEN)) { + /* flag if they asked to put this one first overall */ + if (PMIX_UNDEF == cd->info[n].value.type || + cd->info[n].value.data.flag) { + firstoverall = true; + } + } else if (0 == strncmp(cd->info[n].key, PMIX_EVENT_HDLR_LAST, PMIX_MAX_KEYLEN)) { + /* flag if they asked to put this one last overall */ + if (PMIX_UNDEF == cd->info[n].value.type || + cd->info[n].value.data.flag) { + lastoverall = true; + } + } else if (0 == strncmp(cd->info[n].key, PMIX_EVENT_HDLR_PREPEND, PMIX_MAX_KEYLEN)) { + /* flag if they asked to prepend this handler */ + if (PMIX_UNDEF == cd->info[n].value.type || + cd->info[n].value.data.flag) { + location = PMIX_EVENT_ORDER_PREPEND; + } + } else if (0 == strncmp(cd->info[n].key, PMIX_EVENT_HDLR_APPEND, PMIX_MAX_KEYLEN)) { + /* flag if they asked to append this handler */ + if (PMIX_UNDEF == cd->info[n].value.type || + cd->info[n].value.data.flag) { + location = PMIX_EVENT_ORDER_APPEND; + } + } else if (0 == strncmp(cd->info[n].key, PMIX_EVENT_HDLR_NAME, PMIX_MAX_KEYLEN)) { name = cd->info[n].value.data.string; - } else if (0 == strcmp(cd->info[n].key, PMIX_EVENT_ENVIRO_LEVEL)) { - 
cd->enviro = cd->info[n].value.data.flag; - } else if (0 == strcmp(cd->info[n].key, PMIX_EVENT_RETURN_OBJECT)) { + } else if (0 == strncmp(cd->info[n].key, PMIX_EVENT_ENVIRO_LEVEL, PMIX_MAX_KEYLEN)) { + if (PMIX_UNDEF == cd->info[n].value.type || + cd->info[n].value.data.flag) { + cd->enviro = true; + } + } else if (0 == strncmp(cd->info[n].key, PMIX_EVENT_RETURN_OBJECT, PMIX_MAX_KEYLEN)) { cbobject = cd->info[n].value.data.ptr; + } else if (0 == strncmp(cd->info[n].key, PMIX_EVENT_HDLR_FIRST_IN_CATEGORY, PMIX_MAX_KEYLEN)) { + if (PMIX_UNDEF == cd->info[n].value.type || + cd->info[n].value.data.flag) { + location = PMIX_EVENT_ORDER_FIRST; + } + } else if (0 == strncmp(cd->info[n].key, PMIX_EVENT_HDLR_LAST_IN_CATEGORY, PMIX_MAX_KEYLEN)) { + if (PMIX_UNDEF == cd->info[n].value.type || + cd->info[n].value.data.flag) { + location = PMIX_EVENT_ORDER_LAST; + } + } else if (0 == strncmp(cd->info[n].key, PMIX_EVENT_HDLR_BEFORE, PMIX_MAX_KEYLEN)) { + location = PMIX_EVENT_ORDER_BEFORE; + locator = cd->info[n].value.data.string; + } else if (0 == strncmp(cd->info[n].key, PMIX_EVENT_HDLR_AFTER, PMIX_MAX_KEYLEN)) { + location = PMIX_EVENT_ORDER_AFTER; + locator = cd->info[n].value.data.string; + } else if (0 == strncmp(cd->info[n].key, PMIX_RANGE, PMIX_MAX_KEYLEN)) { + range = cd->info[n].value.data.range; + } else if (0 == strncmp(cd->info[n].key, PMIX_EVENT_CUSTOM_RANGE, PMIX_MAX_KEYLEN)) { + parray = (pmix_proc_t*)cd->info[n].value.data.darray->array; + nprocs = cd->info[n].value.data.darray->size; } else { ixfer = PMIX_NEW(pmix_info_caddy_t); ixfer->info = &cd->info[n]; @@ -320,51 +410,64 @@ static void reg_event_hdlr(int sd, short args, void *cbdata) } } - /* if the code array is NULL, then this is a default event - * registration request */ - if (NULL == cd->codes) { - def = PMIX_NEW(pmix_default_event_t); - if (NULL != name) { - def->name = strdup(name); - } - index = pmix_globals.events.nhdlrs; - ++pmix_globals.events.nhdlrs; - def->index = index; - def->evhdlr = 
cd->evhdlr; - def->cbobject = cbobject; - rc = _add_hdlr(&pmix_globals.events.default_events, &def->super, - index, prepend, &xfer, cd); - PMIX_LIST_DESTRUCT(&xfer); - if (PMIX_SUCCESS != rc && - PMIX_ERR_WOULD_BLOCK != rc) { - /* unable to register */ - --pmix_globals.events.nhdlrs; - rc = PMIX_ERR_EVENT_REGISTRATION; + /* if they indicated this is to be the "first" or "last" event, then + * first check to ensure they didn't already direct some + * other event into the same cherished position */ + if (firstoverall || lastoverall) { + if ((firstoverall && NULL != pmix_globals.events.first) || + (lastoverall && NULL != pmix_globals.events.last)) { + /* oops - someone already took that position */ index = UINT_MAX; + rc = PMIX_ERR_EVENT_REGISTRATION; goto ack; } - if (PMIX_ERR_WOULD_BLOCK == rc) { - /* the callback will provide our response */ - PMIX_RELEASE(cd); - return; + evhdlr = PMIX_NEW(pmix_event_hdlr_t); + if (NULL == evhdlr) { + index = UINT_MAX; + rc = PMIX_ERR_EVENT_REGISTRATION; + goto ack; } - goto ack; - } - - /* if there is only one code, then this is a single event registration */ - if (1 == cd->ncodes) { - sing = PMIX_NEW(pmix_single_event_t); if (NULL != name) { - sing->name = strdup(name); + evhdlr->name = strdup(name); } - sing->code = cd->codes[0]; index = pmix_globals.events.nhdlrs; - sing->index = index; - sing->evhdlr = cd->evhdlr; + evhdlr->index = index; ++pmix_globals.events.nhdlrs; - sing->cbobject = cbobject; - rc = _add_hdlr(&pmix_globals.events.single_events, &sing->super, - index, prepend, &xfer, cd); + evhdlr->rng.range = range; + if (NULL != parray) { + evhdlr->rng.nprocs = nprocs; + PMIX_PROC_CREATE(evhdlr->rng.procs, nprocs); + if (NULL == evhdlr->rng.procs) { + index = UINT_MAX; + rc = PMIX_ERR_EVENT_REGISTRATION; + PMIX_RELEASE(evhdlr); + goto ack; + } + memcpy(evhdlr->rng.procs, parray, nprocs * sizeof(pmix_proc_t)); + } + evhdlr->evhdlr = cd->evhdlr; + evhdlr->cbobject = cbobject; + if (NULL != cd->codes) { + evhdlr->codes = 
(pmix_status_t*)malloc(cd->ncodes * sizeof(pmix_status_t)); + if (NULL == evhdlr->codes) { + PMIX_RELEASE(evhdlr); + index = UINT_MAX; + rc = PMIX_ERR_EVENT_REGISTRATION; + goto ack; + } + memcpy(evhdlr->codes, cd->codes, cd->ncodes * sizeof(pmix_status_t)); + evhdlr->ncodes = cd->ncodes; + } + if (firstoverall) { + pmix_globals.events.first = evhdlr; + } else { + pmix_globals.events.last = evhdlr; + } + cd->index = index; + cd->list = NULL; + cd->hdlr = evhdlr; + cd->firstoverall = firstoverall; + rc = _add_hdlr(cd, &xfer); PMIX_LIST_DESTRUCT(&xfer); if (PMIX_SUCCESS != rc && PMIX_ERR_WOULD_BLOCK != rc) { @@ -372,6 +475,12 @@ static void reg_event_hdlr(int sd, short args, void *cbdata) --pmix_globals.events.nhdlrs; rc = PMIX_ERR_EVENT_REGISTRATION; index = UINT_MAX; + if (firstoverall) { + pmix_globals.events.first = NULL; + } else { + pmix_globals.events.last = NULL; + } + PMIX_RELEASE(evhdlr); goto ack; } if (PMIX_ERR_WOULD_BLOCK == rc) { @@ -382,30 +491,174 @@ static void reg_event_hdlr(int sd, short args, void *cbdata) goto ack; } - /* must be a multi-code registration */ - multi = PMIX_NEW(pmix_multi_event_t); + /* get here if this isn't an overall first or last event - start + * by creating an event */ + evhdlr = PMIX_NEW(pmix_event_hdlr_t); + if (NULL == evhdlr) { + index = UINT_MAX; + rc = PMIX_ERR_EVENT_REGISTRATION; + goto ack; + } if (NULL != name) { - multi->name = strdup(name); + evhdlr->name = strdup(name); } - multi->codes = (pmix_status_t*)malloc(cd->ncodes * sizeof(pmix_status_t)); - multi->ncodes = cd->ncodes; - memcpy(multi->codes, cd->codes, cd->ncodes * sizeof(pmix_status_t)); index = pmix_globals.events.nhdlrs; - multi->index = index; - multi->evhdlr = cd->evhdlr; + evhdlr->index = index; ++pmix_globals.events.nhdlrs; - multi->cbobject = cbobject; - rc = _add_hdlr(&pmix_globals.events.multi_events, &multi->super, - index, prepend, &xfer, cd); + evhdlr->precedence = location; + evhdlr->locator = locator; + evhdlr->rng.range = range; + if (NULL 
!= parray) { + evhdlr->rng.nprocs = nprocs; + PMIX_PROC_CREATE(evhdlr->rng.procs, nprocs); + if (NULL == evhdlr->rng.procs) { + index = UINT_MAX; + rc = PMIX_ERR_EVENT_REGISTRATION; + PMIX_RELEASE(evhdlr); + goto ack; + } + memcpy(evhdlr->rng.procs, parray, nprocs * sizeof(pmix_proc_t)); + } + evhdlr->evhdlr = cd->evhdlr; + evhdlr->cbobject = cbobject; + if (NULL == cd->codes) { + /* this is a default handler */ + cd->list = &pmix_globals.events.default_events; + } else { + evhdlr->codes = (pmix_status_t*)malloc(cd->ncodes * sizeof(pmix_status_t)); + if (NULL == evhdlr->codes) { + PMIX_RELEASE(evhdlr); + index = UINT_MAX; + rc = PMIX_ERR_EVENT_REGISTRATION; + goto ack; + } + memcpy(evhdlr->codes, cd->codes, cd->ncodes * sizeof(pmix_status_t)); + evhdlr->ncodes = cd->ncodes; + if (1 == cd->ncodes) { + cd->list = &pmix_globals.events.single_events; + } else { + cd->list = &pmix_globals.events.multi_events; + } + } + /* setup to add the handler */ + cd->index = index; + cd->hdlr = evhdlr; + cd->firstoverall = false; + /* tell the server about it, if necessary - any actions + * will be deferred until after this event completes */ + if (PMIX_RANGE_PROC_LOCAL == range) { + rc = PMIX_SUCCESS; + } else { + rc = _add_hdlr(cd, &xfer); + } PMIX_LIST_DESTRUCT(&xfer); if (PMIX_SUCCESS != rc && PMIX_ERR_WOULD_BLOCK != rc) { - /* unable to register */ + /* unable to register */ --pmix_globals.events.nhdlrs; rc = PMIX_ERR_EVENT_REGISTRATION; index = UINT_MAX; + PMIX_RELEASE(evhdlr); goto ack; } + /* now add this event to the appropriate list - if the registration + * subsequently fails, it will be removed */ + + /* if the list is empty, or no location was specified, just put this on it */ + if (0 == pmix_list_get_size(cd->list) || + PMIX_EVENT_ORDER_NONE == location) { + pmix_list_prepend(cd->list, &evhdlr->super); + } else if (PMIX_EVENT_ORDER_FIRST == location) { + /* see if the first handler on the list was also declared as "first" */ + ev = 
(pmix_event_hdlr_t*)pmix_list_get_first(cd->list); + if (PMIX_EVENT_ORDER_FIRST == ev->precedence) { + /* this is an error */ + --pmix_globals.events.nhdlrs; + rc = PMIX_ERR_EVENT_REGISTRATION; + index = UINT_MAX; + PMIX_RELEASE(evhdlr); + goto ack; + } + /* prepend it to the list */ + pmix_list_prepend(cd->list, &evhdlr->super); + } else if (PMIX_EVENT_ORDER_LAST == location) { + /* see if the last handler on the list was also declared as "last" */ + ev = (pmix_event_hdlr_t*)pmix_list_get_last(cd->list); + if (PMIX_EVENT_ORDER_LAST == ev->precedence) { + /* this is an error */ + --pmix_globals.events.nhdlrs; + rc = PMIX_ERR_EVENT_REGISTRATION; + index = UINT_MAX; + PMIX_RELEASE(evhdlr); + goto ack; + } + /* append it to the list */ + pmix_list_append(cd->list, &evhdlr->super); + } else if (PMIX_EVENT_ORDER_PREPEND == location) { + /* we know the list isn't empty - check the first element to see if + * it is designated to be "first". If so, then we need to put this + * right after it */ + ev = (pmix_event_hdlr_t*)pmix_list_get_first(cd->list); + if (PMIX_EVENT_ORDER_FIRST == ev->precedence) { + ev = (pmix_event_hdlr_t*)pmix_list_get_next(&ev->super); + if (NULL != ev) { + pmix_list_insert_pos(cd->list, &ev->super, &evhdlr->super); + } else { + /* we are at the end of the list */ + pmix_list_append(cd->list, &evhdlr->super); + } + } else { + pmix_list_prepend(cd->list, &evhdlr->super); + } + } else if (PMIX_EVENT_ORDER_APPEND == location) { + /* we know the list isn't empty - check the last element to see if + * it is designated to be "last". 
If so, then we need to put this + * right before it */ + ev = (pmix_event_hdlr_t*)pmix_list_get_last(cd->list); + if (PMIX_EVENT_ORDER_LAST == ev->precedence) { + pmix_list_insert_pos(cd->list, &ev->super, &evhdlr->super); + } else { + pmix_list_append(cd->list, &evhdlr->super); + } + } else { + /* find the handler named by the BEFORE/AFTER locator */ + found = false; + PMIX_LIST_FOREACH(ev, cd->list, pmix_event_hdlr_t) { + if (NULL == ev->name || NULL == locator) { + continue; + } + if (0 == strcmp(ev->name, locator)) { + if (PMIX_EVENT_ORDER_BEFORE == location) { + /* put it before this handler */ + pmix_list_insert_pos(cd->list, &ev->super, &evhdlr->super); + } else { + /* put it after this handler */ + ev = (pmix_event_hdlr_t*)pmix_list_get_next(&ev->super); + if (NULL != ev) { + pmix_list_insert_pos(cd->list, &ev->super, &evhdlr->super); + } else { + /* we are at the end of the list */ + pmix_list_append(cd->list, &evhdlr->super); + } + } + found = true; + break; + } + } + /* if the handler wasn't found, then we return an error. At some + * future time, we may change this behavior and cache this handler + * until the reference one has been registered. However, this could + * turn out to be a laborious search procedure as the reference + * event handler may in turn be dependent on another handler, etc. 
*/ + if (!found) { + /* this is an error */ + --pmix_globals.events.nhdlrs; + rc = PMIX_ERR_EVENT_REGISTRATION; + index = UINT_MAX; + PMIX_RELEASE(evhdlr); + goto ack; + } + } if (PMIX_ERR_WOULD_BLOCK == rc) { /* the callback will provide our response */ PMIX_RELEASE(cd); @@ -415,7 +668,9 @@ static void reg_event_hdlr(int sd, short args, void *cbdata) ack: /* acknowledge the registration so the caller can release * their data */ - cd->cbfunc.evregcbfn(rc, index, cd->cbdata); + if (NULL != cd->evregcbfn) { + cd->evregcbfn(rc, index, cd->cbdata); + } PMIX_RELEASE(cd); } @@ -426,17 +681,17 @@ PMIX_EXPORT void PMIx_Register_event_handler(pmix_status_t codes[], size_t ncode pmix_evhdlr_reg_cbfunc_t cbfunc, void *cbdata) { - pmix_shift_caddy_t *cd; + pmix_rshift_caddy_t *cd; /* need to thread shift this request so we can access * our global data to register this *local* event handler */ - cd = PMIX_NEW(pmix_shift_caddy_t); + cd = PMIX_NEW(pmix_rshift_caddy_t); cd->codes = codes; cd->ncodes = ncodes; cd->info = info; cd->ninfo = ninfo; cd->evhdlr = event_hdlr; - cd->cbfunc.errregcbfn = cbfunc; + cd->evregcbfn = cbfunc; cd->cbdata = cbdata; pmix_output_verbose(2, pmix_globals.debug_output, @@ -449,14 +704,12 @@ static void dereg_event_hdlr(int sd, short args, void *cbdata) { pmix_shift_caddy_t *cd = (pmix_shift_caddy_t*)cbdata; pmix_buffer_t *msg = NULL; - pmix_single_event_t *sing, *s2; - pmix_multi_event_t *multi, *m2; - pmix_default_event_t *def; + pmix_event_hdlr_t *evhdlr, *ev; pmix_cmd_t cmd = PMIX_DEREGEVENTS_CMD; pmix_status_t rc = PMIX_SUCCESS; pmix_status_t wildcard = PMIX_MAX_ERR_CONSTANT; size_t n; - bool found, foundcode; + pmix_active_code_t *active; /* if I am not the server, then I need to notify the server * to remove my registration */ @@ -468,101 +721,131 @@ static void dereg_event_hdlr(int sd, short args, void *cbdata) } } - /* the registration can be in any of three places, so check them all */ - PMIX_LIST_FOREACH(def, 
&pmix_globals.events.default_events, pmix_default_event_t) { - if (def->index == cd->ref) { + /* check the first and last locations */ + if ((NULL != pmix_globals.events.first && pmix_globals.events.first->index == cd->ref) || + (NULL != pmix_globals.events.last && pmix_globals.events.last->index == cd->ref)) { + /* found it */ + if (NULL != pmix_globals.events.first && pmix_globals.events.first->index == cd->ref) { + ev = pmix_globals.events.first; + } else { + ev = pmix_globals.events.last; + } + if (NULL != msg) { + /* if this is a default handler, see if any other default + * handlers remain */ + if (NULL == ev->codes) { + if (0 == pmix_list_get_size(&pmix_globals.events.default_events)) { + /* tell the server to dereg our default handler */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &wildcard, 1, PMIX_STATUS))) { + PMIX_RELEASE(msg); + goto cleanup; + } + } + } else { + for (n=0; n < ev->ncodes; n++) { + /* see if this is the last registration we have for this code */ + PMIX_LIST_FOREACH(active, &pmix_globals.events.actives, pmix_active_code_t) { + if (active->code == ev->codes[n]) { + --active->nregs; + if (0 == active->nregs) { + pmix_list_remove_item(&pmix_globals.events.actives, &active->super); + /* tell the server to dereg this code */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &active->code, 1, PMIX_STATUS))) { + PMIX_RELEASE(active); + PMIX_RELEASE(msg); + goto cleanup; + } + PMIX_RELEASE(active); + } + break; + } + } + } + } + } + if (ev == pmix_globals.events.first) { + pmix_globals.events.first = NULL; + } else { + pmix_globals.events.last = NULL; + } + PMIX_RELEASE(ev); + goto cleanup; + } + + /* the registration can be in any of three places, so check each of them */ + PMIX_LIST_FOREACH(evhdlr, &pmix_globals.events.default_events, pmix_event_hdlr_t) { + if (evhdlr->index == cd->ref) { /* found it */ - pmix_list_remove_item(&pmix_globals.events.default_events, &def->super); + pmix_list_remove_item(&pmix_globals.events.default_events, 
&evhdlr->super); if (NULL != msg) { /* if there are no more default handlers registered, tell * the server to dereg the default handler */ if (0 == pmix_list_get_size(&pmix_globals.events.default_events)) { - n = 1; - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &n, 1, PMIX_SIZE))) { - PMIX_RELEASE(msg); - goto cleanup; - } if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &wildcard, 1, PMIX_STATUS))) { PMIX_RELEASE(msg); goto cleanup; } } } - PMIX_RELEASE(def); + PMIX_RELEASE(evhdlr); goto report; } } - PMIX_LIST_FOREACH(sing, &pmix_globals.events.single_events, pmix_single_event_t) { - if (sing->index == cd->ref) { + PMIX_LIST_FOREACH(evhdlr, &pmix_globals.events.single_events, pmix_event_hdlr_t) { + if (evhdlr->index == cd->ref) { /* found it */ - pmix_list_remove_item(&pmix_globals.events.single_events, &sing->super); + pmix_list_remove_item(&pmix_globals.events.single_events, &evhdlr->super); if (NULL != msg) { - /* if there are no more handlers registered for this code, tell - * the server to dereg the handler for this code */ - found = false; - PMIX_LIST_FOREACH(s2, &pmix_globals.events.single_events, pmix_single_event_t) { - if (s2->code == sing->code) { - found = true; + /* see if this is the last registration we have for this code */ + PMIX_LIST_FOREACH(active, &pmix_globals.events.actives, pmix_active_code_t) { + if (active->code == evhdlr->codes[0]) { + --active->nregs; + if (0 == active->nregs) { + pmix_list_remove_item(&pmix_globals.events.actives, &active->super); + if (NULL != msg) { + /* tell the server to dereg this code */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &active->code, 1, PMIX_STATUS))) { + PMIX_RELEASE(active); + PMIX_RELEASE(msg); + goto cleanup; + } + } + PMIX_RELEASE(active); + } break; } } - if (!found) { - n = 1; - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &n, 1, PMIX_SIZE))) { - PMIX_RELEASE(msg); - PMIX_RELEASE(sing); - goto cleanup; - } - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &sing->code, 1, PMIX_STATUS))) { 
- PMIX_RELEASE(msg); - PMIX_RELEASE(sing); - goto cleanup; - } - } } - PMIX_RELEASE(sing); + PMIX_RELEASE(evhdlr); goto report; } } - PMIX_LIST_FOREACH(multi, &pmix_globals.events.multi_events, pmix_multi_event_t) { - if (multi->index == cd->ref) { + PMIX_LIST_FOREACH(evhdlr, &pmix_globals.events.multi_events, pmix_event_hdlr_t) { + if (evhdlr->index == cd->ref) { /* found it */ - pmix_list_remove_item(&pmix_globals.events.multi_events, &multi->super); - if (NULL != msg) { - /* if there are no more handlers registered for this code, tell - * the server to dereg the handler for this code */ - found = false; - PMIX_LIST_FOREACH(m2, &pmix_globals.events.multi_events, pmix_multi_event_t) { - if (m2->ncodes != multi->ncodes) { - continue; - } - foundcode = true; - for (n=0; n < multi->ncodes; n++) { - if (m2->codes[n] != multi->codes[n]) { - foundcode = false; - break; + pmix_list_remove_item(&pmix_globals.events.multi_events, &evhdlr->super); + for (n=0; n < evhdlr->ncodes; n++) { + /* see if this is the last registration we have for this code */ + PMIX_LIST_FOREACH(active, &pmix_globals.events.actives, pmix_active_code_t) { + if (active->code == evhdlr->codes[n]) { + --active->nregs; + if (0 == active->nregs) { + pmix_list_remove_item(&pmix_globals.events.actives, &active->super); + if (NULL != msg) { + /* tell the server to dereg this code */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &active->code, 1, PMIX_STATUS))) { + PMIX_RELEASE(active); + PMIX_RELEASE(msg); + goto cleanup; + } + } + PMIX_RELEASE(active); } - } - if (foundcode) { - found = true; break; } } - if (!found) { - n = multi->ncodes; - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &n, 1, PMIX_SIZE))) { - PMIX_RELEASE(msg); - PMIX_RELEASE(multi); - goto cleanup; - } - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &multi->codes, n, PMIX_STATUS))) { - PMIX_RELEASE(msg); - PMIX_RELEASE(multi); - goto cleanup; - } - } } - PMIX_RELEASE(multi); + PMIX_RELEASE(evhdlr); goto report; } } diff --git 
a/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.c b/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.c index 8cddeb5d443..bdfb143c9af 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.c +++ b/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.c @@ -257,6 +257,8 @@ static void qcon(pmix_query_caddy_t *p) { p->queries = NULL; p->nqueries = 0; + p->targets = NULL; + p->ntargets = 0; p->info = NULL; p->ninfo = 0; p->cbfunc = NULL; diff --git a/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.h b/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.h index 85560390d6a..1333cb24f1f 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.h +++ b/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.h @@ -72,7 +72,9 @@ typedef enum { PMIX_DEREGEVENTS_CMD, PMIX_QUERY_CMD, PMIX_LOG_CMD, - PMIX_ALLOC_CMD + PMIX_ALLOC_CMD, + PMIX_JOB_CONTROL_CMD, + PMIX_MONITOR_CMD } pmix_cmd_t; /* provide a "pretty-print" function for cmds */ @@ -214,6 +216,8 @@ typedef struct { pmix_status_t status; pmix_query_t *queries; size_t nqueries; + pmix_proc_t *targets; + size_t ntargets; pmix_info_t *info; size_t ninfo; pmix_info_cbfunc_t cbfunc; diff --git a/opal/mca/pmix/pmix2x/pmix/src/include/types.h b/opal/mca/pmix/pmix2x/pmix/src/include/types.h index d46df75ec8f..7c073ccf4f8 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/include/types.h +++ b/opal/mca/pmix/pmix2x/pmix/src/include/types.h @@ -256,4 +256,13 @@ typedef struct event pmix_event_t; #define pmix_event_active(x, y, z) event_active((x), (y), (z)) +#define pmix_event_evtimer_new(b, cb, arg) pmix_event_new((b), -1, 0, (cb), (arg)) + +#define pmix_event_evtimer_add(x, tv) pmix_event_add((x), (tv)) + +#define pmix_event_evtimer_set(b, x, cb, arg) event_assign((x), (b), -1, 0, (event_callback_fn) (cb), (arg)) + +#define pmix_event_evtimer_del(x) pmix_event_del((x)) + + #endif /* PMIX_TYPES_H */ diff --git a/orte/mca/sensor/Makefile.am b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/Makefile.am similarity index 62% rename 
from orte/mca/sensor/Makefile.am rename to opal/mca/pmix/pmix2x/pmix/src/mca/psensor/Makefile.am index 2e59fe28eba..81072424d0e 100644 --- a/orte/mca/sensor/Makefile.am +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/Makefile.am @@ -3,26 +3,27 @@ # # Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ -# +# # Additional copyrights may follow -# +# # $HEADER$ # +AM_CPPFLAGS = $(LTDLINCL) + # main library setup -noinst_LTLIBRARIES = libmca_sensor.la -libmca_sensor_la_SOURCES = +noinst_LTLIBRARIES = libmca_psensor.la +libmca_psensor_la_SOURCES = # local files -headers = sensor.h \ - sensor_types.h +headers = psensor.h -libmca_sensor_la_SOURCES += $(headers) +libmca_psensor_la_SOURCES += $(headers) # Conditionally install the header files if WANT_INSTALL_HEADERS -ortedir = $(ompiincludedir)/$(subdir) -nobase_orte_HEADERS = $(headers) +pmixdir = $(pmixincludedir)/$(subdir) +nobase_pmix_HEADERS = $(headers) endif include base/Makefile.am diff --git a/orte/mca/sensor/base/Makefile.am b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/Makefile.am similarity index 60% rename from orte/mca/sensor/base/Makefile.am rename to opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/Makefile.am index 7155261700a..fe9c53ed553 100644 --- a/orte/mca/sensor/base/Makefile.am +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved. # # Copyright (c) 2017 Intel, Inc. All rights reserved. 
@@ -11,10 +11,9 @@ # headers += \ - base/base.h \ - base/sensor_private.h + base/base.h -libmca_sensor_la_SOURCES += \ - base/sensor_base_frame.c \ - base/sensor_base_select.c \ - base/sensor_base_fns.c +libmca_psensor_la_SOURCES += \ + base/psensor_base_frame.c \ + base/psensor_base_select.c \ + base/psensor_base_stubs.c diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/base.h b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/base.h new file mode 100644 index 00000000000..a01437acff2 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/base.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved. + * + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + */ + +#ifndef PMIX_PSENSOR_BASE_H_ +#define PMIX_PSENSOR_BASE_H_ + +#include + +#include "src/class/pmix_list.h" +#include "src/mca/mca.h" +#include "src/mca/base/pmix_mca_base_framework.h" + +#include "src/mca/psensor/psensor.h" + +BEGIN_C_DECLS + +/* + * MCA Framework + */ +PMIX_EXPORT extern pmix_mca_base_framework_t pmix_psensor_base_framework; + +PMIX_EXPORT int pmix_psensor_base_select(void); + +/* define a struct to hold framework-global values */ +typedef struct { + pmix_list_t actives; + pmix_event_base_t *evbase; +} pmix_psensor_base_t; + +typedef struct { + pmix_list_item_t super; + pmix_psensor_base_component_t *component; + pmix_psensor_base_module_t *module; + int priority; +} pmix_psensor_active_module_t; +PMIX_CLASS_DECLARATION(pmix_psensor_active_module_t); + +PMIX_EXPORT extern pmix_psensor_base_t pmix_psensor_base; + +PMIX_EXPORT pmix_status_t pmix_psensor_base_start(pmix_peer_t *requestor, pmix_status_t error, + const pmix_info_t *monitor, + const pmix_info_t directives[], size_t ndirs); + +PMIX_EXPORT pmix_status_t pmix_psensor_base_stop(pmix_peer_t *requestor, + char 
*id); + +END_C_DECLS +#endif diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_frame.c b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_frame.c new file mode 100644 index 00000000000..d10bab1cb1f --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_frame.c @@ -0,0 +1,103 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved. + * + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include + +#include + +#include +#include PMIX_EVENT_HEADER + +#include "src/mca/mca.h" +#include "src/mca/base/base.h" +#include "src/class/pmix_list.h" +#include "src/runtime/pmix_progress_threads.h" +#include "src/include/types.h" + +#include "src/mca/psensor/base/base.h" + +/* + * The following file was created by configure. It contains extern + * statements and the definition of an array of pointers to each + * component's public mca_base_component_t struct. 
+ */ + +#include "src/mca/psensor/base/static-components.h" + +/* + * Global variables + */ +pmix_psensor_base_module_t pmix_psensor = { + pmix_psensor_base_start, + pmix_psensor_base_stop +}; +pmix_psensor_base_t pmix_psensor_base = {{{0}}}; + +static bool use_separate_thread = false; + +static int pmix_psensor_register(pmix_mca_base_register_flag_t flags) +{ + (void) pmix_mca_base_var_register("pmix", "psensor", "base", "use_separate_thread", + "Use a separate thread for monitoring local procs", + PMIX_MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + PMIX_INFO_LVL_9, + PMIX_MCA_BASE_VAR_SCOPE_READONLY, + &use_separate_thread); + return PMIX_SUCCESS; +} + + +static int pmix_psensor_base_close(void) +{ + PMIX_LIST_DESTRUCT(&pmix_psensor_base.actives); + + if (use_separate_thread && NULL != pmix_psensor_base.evbase) { + (void)pmix_progress_thread_stop("PSENSOR"); + } + + /* Close all remaining available components */ + return pmix_mca_base_framework_components_close(&pmix_psensor_base_framework, NULL); +} + +/** + * Function for finding and opening either all MCA components, or the one + * that was specifically requested via a MCA parameter. 
+ */ +static int pmix_psensor_base_open(pmix_mca_base_open_flag_t flags) +{ + /* construct the list of modules */ + PMIX_CONSTRUCT(&pmix_psensor_base.actives, pmix_list_t); + + if (use_separate_thread) { + /* create an event base and progress thread for us */ + if (NULL == (pmix_psensor_base.evbase = pmix_progress_thread_init("PSENSOR"))) { + return PMIX_ERROR; + } + + } else { + pmix_psensor_base.evbase = pmix_globals.evbase; + } + + /* Open up all available components */ + return pmix_mca_base_framework_components_open(&pmix_psensor_base_framework, flags); +} + +PMIX_MCA_BASE_FRAMEWORK_DECLARE(pmix, psensor, "PMIx Monitoring Sensors", + pmix_psensor_register, + pmix_psensor_base_open, pmix_psensor_base_close, + mca_psensor_base_static_components, 0); + +PMIX_CLASS_INSTANCE(pmix_psensor_active_module_t, + pmix_list_item_t, + NULL, NULL); diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_select.c b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_select.c new file mode 100644 index 00000000000..4a1f1f0c2a5 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_select.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2016-2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include + +#include + +#include "src/mca/mca.h" +#include "src/mca/base/base.h" + +#include "src/mca/psensor/base/base.h" + +static bool selected = false; + +/* Function for selecting a prioritized list of components + * from all those that are available. */ +int pmix_psensor_base_select(void) +{ + pmix_mca_base_component_list_item_t *cli = NULL; + pmix_psensor_base_component_t *component = NULL; + pmix_psensor_active_module_t *newactive, *active; + pmix_mca_base_module_t *mod; + int pri; + bool inserted; + + if (selected) { + /* ensure we don't do this twice */ + return PMIX_SUCCESS; + } + selected = true; + + /* Query all available components and ask if they have a module */ + PMIX_LIST_FOREACH(cli, &pmix_psensor_base_framework.framework_components, pmix_mca_base_component_list_item_t) { + component = (pmix_psensor_base_component_t *) cli->cli_component; + + pmix_output_verbose(5, pmix_psensor_base_framework.framework_output, + "mca:psensor:select: checking available component %s", + component->base.pmix_mca_component_name); + + /* get the module for this component */ + if (PMIX_SUCCESS != component->base.pmix_mca_query_component(&mod, &pri)) { + continue; + } + + /* add to our prioritized list of available actives */ + newactive = PMIX_NEW(pmix_psensor_active_module_t); + newactive->priority = pri; + newactive->component = component; + newactive->module = (pmix_psensor_base_module_t*)mod; + + /* maintain priority order */ + inserted = false; + PMIX_LIST_FOREACH(active, &pmix_psensor_base.actives, pmix_psensor_active_module_t) { + if (newactive->priority > active->priority) { + pmix_list_insert_pos(&pmix_psensor_base.actives, + (pmix_list_item_t*)active, &newactive->super); + inserted = true; + break; + } + } + if (!inserted) { + /* must be lowest priority - add to end */ + pmix_list_append(&pmix_psensor_base.actives, &newactive->super); + } + } + + if (4 < 
pmix_output_get_verbosity(pmix_psensor_base_framework.framework_output)) { + pmix_output(0, "Final PSENSOR priorities"); + /* show the prioritized list */ + PMIX_LIST_FOREACH(active, &pmix_psensor_base.actives, pmix_psensor_active_module_t) { + pmix_output(0, "\tPSENSOR: %s Priority: %d", + active->component->base.pmix_mca_component_name, active->priority); + } + } + + return PMIX_SUCCESS; +} diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_stubs.c b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_stubs.c new file mode 100644 index 00000000000..c24b57d6986 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/base/psensor_base_stubs.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include +#include + +#include "src/util/error.h" + +#include "src/mca/psensor/base/base.h" + +pmix_status_t pmix_psensor_base_start(pmix_peer_t *requestor, pmix_status_t error, + const pmix_info_t *monitor, + const pmix_info_t directives[], size_t ndirs) +{ + pmix_psensor_active_module_t *mod; + pmix_status_t rc; + + pmix_output_verbose(5, pmix_psensor_base_framework.framework_output, + "%s:%d sensor:base: starting sensors", + pmix_globals.myid.nspace, pmix_globals.myid.rank); + + /* call the start function of all modules in priority order */ + PMIX_LIST_FOREACH(mod, &pmix_psensor_base.actives, pmix_psensor_active_module_t) { + if (NULL != mod->module->start) { + rc = mod->module->start(requestor, error, monitor, directives, ndirs); + if (PMIX_SUCCESS != rc && PMIX_ERR_TAKE_NEXT_OPTION != rc) { + return rc; + } + } + } + + return PMIX_SUCCESS; +} + +pmix_status_t pmix_psensor_base_stop(pmix_peer_t *requestor, + char *id) +{ + pmix_psensor_active_module_t *mod; + pmix_status_t rc; +
+ pmix_output_verbose(5, pmix_psensor_base_framework.framework_output, + "%s:%d sensor:base: stopping sensors", + pmix_globals.myid.nspace, pmix_globals.myid.rank); + + /* call the stop function of all modules in priority order */ + PMIX_LIST_FOREACH(mod, &pmix_psensor_base.actives, pmix_psensor_active_module_t) { + if (NULL != mod->module->stop) { + rc = mod->module->stop(requestor, id); + if (PMIX_SUCCESS != rc && PMIX_ERR_TAKE_NEXT_OPTION != rc) { + return rc; + } + } + } + + return PMIX_SUCCESS; +} diff --git a/orte/mca/sensor/file/Makefile.am b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/Makefile.am similarity index 50% rename from orte/mca/sensor/file/Makefile.am rename to opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/Makefile.am index 2d0640ab433..30dce46e38e 100644 --- a/orte/mca/sensor/file/Makefile.am +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/Makefile.am @@ -1,37 +1,37 @@ # -# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ -# +# # Additional copyrights may follow -# +# # $HEADER$ # -dist_ompidata_DATA = help-orte-sensor-file.txt +dist_pmixdata_DATA = help-pmix-psensor-file.txt sources = \ - sensor_file.c \ - sensor_file.h \ - sensor_file_component.c + psensor_file.c \ + psensor_file.h \ + psensor_file_component.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la # (for static builds). 
-if MCA_BUILD_orte_sensor_file_DSO +if MCA_BUILD_pmix_psensor_file_DSO component_noinst = -component_install = mca_sensor_file.la +component_install = mca_psensor_file.la else -component_noinst = libmca_sensor_file.la +component_noinst = libmca_psensor_file.la component_install = endif -mcacomponentdir = $(ompilibdir) +mcacomponentdir = $(pmixlibdir) mcacomponent_LTLIBRARIES = $(component_install) -mca_sensor_file_la_SOURCES = $(sources) -mca_sensor_file_la_LDFLAGS = -module -avoid-version +mca_psensor_file_la_SOURCES = $(sources) +mca_psensor_file_la_LDFLAGS = -module -avoid-version noinst_LTLIBRARIES = $(component_noinst) -libmca_sensor_file_la_SOURCES =$(sources) -libmca_sensor_file_la_LDFLAGS = -module -avoid-version +libmca_psensor_file_la_SOURCES =$(sources) +libmca_psensor_file_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/sensor/file/help-orte-sensor-file.txt b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/help-pmix-psensor-file.txt similarity index 98% rename from orte/mca/sensor/file/help-orte-sensor-file.txt rename to opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/help-pmix-psensor-file.txt index 321c6cd7711..98fd3a010c6 100644 --- a/orte/mca/sensor/file/help-orte-sensor-file.txt +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/help-pmix-psensor-file.txt @@ -4,9 +4,9 @@ # # Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ -# +# # Additional copyrights may follow -# +# # $HEADER$ # # This is the US/English general help file for the file sensor diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file.c b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file.c new file mode 100644 index 00000000000..5280c640e12 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. 
All rights + * reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. + * + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include + +#include +#include +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_NETDB_H +#include +#endif +#ifdef HAVE_SYS_PARAM_H +#include +#endif +#include +#include +#include +#ifdef HAVE_TIME_H +#include +#endif +#include +#include + +#include "src/class/pmix_list.h" +#include "src/include/pmix_globals.h" +#include "src/util/error.h" +#include "src/util/output.h" +#include "src/util/show_help.h" + +#include "src/mca/psensor/base/base.h" +#include "psensor_file.h" + +/* declare the API functions */ +static pmix_status_t start(pmix_peer_t *requestor, pmix_status_t error, + const pmix_info_t *monitor, + const pmix_info_t directives[], size_t ndirs); +static pmix_status_t stop(pmix_peer_t *requestor, char *id); + +/* instantiate the module */ +pmix_psensor_base_module_t pmix_psensor_file_module = { + .start = start, + .stop = stop +}; + +/* define a tracking object */ +typedef struct { + pmix_list_item_t super; + pmix_peer_t *requestor; + char *id; + bool event_active; + pmix_event_t ev; + pmix_event_t cdev; + struct timeval tv; + int tick; + char *file; + bool file_size; + bool file_access; + bool file_mod; + size_t last_size; + time_t last_access; + time_t last_mod; + uint32_t ndrops; + uint32_t nmisses; + pmix_status_t error; + pmix_data_range_t range; + pmix_info_t *info; + size_t ninfo; +} file_tracker_t; +static void ft_constructor(file_tracker_t *ft) +{ + ft->requestor = NULL; + ft->id = NULL; + ft->event_active = false; + ft->tv.tv_sec = 0; + ft->tv.tv_usec = 0; + ft->tick = 0; + ft->file_size = false; + ft->file_access = false; + ft->file_mod = false; + ft->last_size = 0; + ft->last_access = 0; + ft->last_mod = 0; + ft->ndrops = 0; + ft->nmisses = 0; + ft->error = 
PMIX_SUCCESS; + ft->range = PMIX_RANGE_NAMESPACE; + ft->info = NULL; + ft->ninfo = 0; +} +static void ft_destructor(file_tracker_t *ft) +{ + if (NULL != ft->requestor) { + PMIX_RELEASE(ft->requestor); + } + if (NULL != ft->id) { + free(ft->id); + } + if (ft->event_active) { + pmix_event_del(&ft->ev); + } + if (NULL != ft->file) { + free(ft->file); + } + if (NULL != ft->info) { + PMIX_INFO_FREE(ft->info, ft->ninfo); + } +} +PMIX_CLASS_INSTANCE(file_tracker_t, + pmix_list_item_t, + ft_constructor, ft_destructor); + +/* define a local caddy */ +typedef struct { + pmix_object_t super; + pmix_event_t ev; + pmix_peer_t *requestor; + char *id; +} file_caddy_t; +static void cd_con(file_caddy_t *p) +{ + p->requestor = NULL; + p->id = NULL; +} +static void cd_des(file_caddy_t *p) +{ + if (NULL != (p->requestor)) { + PMIX_RELEASE(p->requestor); + } + if (NULL != p->id) { + free(p->id); + } +} +PMIX_CLASS_INSTANCE(file_caddy_t, + pmix_object_t, + cd_con, cd_des); + +static void file_sample(int sd, short args, void *cbdata); + +static void add_tracker(int sd, short flags, void *cbdata) +{ + file_tracker_t *ft = (file_tracker_t*)cbdata; + + /* add the tracker to our list */ + pmix_list_append(&mca_psensor_file_component.trackers, &ft->super); + + /* setup the timer event */ + pmix_event_evtimer_set(pmix_psensor_base.evbase, &ft->ev, + file_sample, ft); + pmix_event_evtimer_add(&ft->ev, &ft->tv); + ft->event_active = true; +} + +/* + * Start monitoring of local processes + */ +static pmix_status_t start(pmix_peer_t *requestor, pmix_status_t error, + const pmix_info_t *monitor, + const pmix_info_t directives[], size_t ndirs) +{ + file_tracker_t *ft; + size_t n; + + PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output, + "[%s:%d] checking file monitoring for requestor %s:%d", + pmix_globals.myid.nspace, pmix_globals.myid.rank, + requestor->info->nptr->nspace, requestor->info->rank)); + + /* if they didn't ask to monitor a file, then nothing for us to do */ + if (0 
!= strcmp(monitor->key, PMIX_MONITOR_FILE)) { + return PMIX_ERR_TAKE_NEXT_OPTION; + } + + /* setup to track this monitoring operation - NOTE(review): the 'error' argument is not recorded in ft->error here; confirm whether that is intended */ + ft = PMIX_NEW(file_tracker_t); + PMIX_RETAIN(requestor); + ft->requestor = requestor; + ft->file = strdup(monitor->value.data.string); + + /* check the directives to see what they want monitored */ + for (n=0; n < ndirs; n++) { + if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_SIZE)) { + ft->file_size = directives[n].value.data.flag; + } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_ACCESS)) { + ft->file_access = directives[n].value.data.flag; + } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_MODIFY)) { + ft->file_mod = directives[n].value.data.flag; + } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_DROPS)) { + ft->ndrops = directives[n].value.data.uint32; + } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_FILE_CHECK_TIME)) { + ft->tv.tv_sec = directives[n].value.data.uint32; + } else if (0 == strcmp(directives[n].key, PMIX_RANGE)) { + ft->range = directives[n].value.data.range; + } + } + + if (0 == ft->tv.tv_sec || + (!ft->file_size && !ft->file_access && !ft->file_mod)) { + /* didn't specify a sample rate, or what should be sampled */ + PMIX_RELEASE(ft); + return PMIX_ERR_BAD_PARAM; + } + + /* need to push into our event base to add this to our trackers */ + pmix_event_assign(&ft->cdev, pmix_psensor_base.evbase, -1, + EV_WRITE, add_tracker, ft); + pmix_event_active(&ft->cdev, EV_WRITE, 1); + + return PMIX_SUCCESS; +} + + +static void del_tracker(int sd, short flags, void *cbdata) +{ + file_caddy_t *cd = (file_caddy_t*)cbdata; + file_tracker_t *ft, *ftnext; + + /* remove the tracker from our list */ + PMIX_LIST_FOREACH_SAFE(ft, ftnext, &mca_psensor_file_component.trackers, file_tracker_t) { + if (ft->requestor != cd->requestor) { + continue; + } + if (NULL == cd->id || + (NULL != ft->id && 0 == strcmp(ft->id, cd->id))) { +
pmix_list_remove_item(&mca_psensor_file_component.trackers, &ft->super); + PMIX_RELEASE(ft); + } + } + PMIX_RELEASE(cd); +} + +static pmix_status_t stop(pmix_peer_t *requestor, char *id) +{ + file_caddy_t *cd; + + cd = PMIX_NEW(file_caddy_t); + PMIX_RETAIN(requestor); + cd->requestor = requestor; + cd->id = (NULL == id) ? NULL : strdup(id); /* a NULL id is legal: del_tracker treats it as "all trackers of this requestor" */ + + /* need to push into our event base to remove the matching trackers */ + pmix_event_assign(&cd->ev, pmix_psensor_base.evbase, -1, + EV_WRITE, del_tracker, cd); + pmix_event_active(&cd->ev, EV_WRITE, 1); + + return PMIX_SUCCESS; +} + +static void opcbfunc(pmix_status_t status, void *cbdata) +{ + file_tracker_t *ft = (file_tracker_t*)cbdata; + + PMIX_RELEASE(ft); +} + +static void file_sample(int sd, short args, void *cbdata) +{ + file_tracker_t *ft = (file_tracker_t*)cbdata; + struct stat buf; + pmix_status_t rc; + pmix_proc_t source; + + PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output, + "[%s:%d] sampling file %s", + pmix_globals.myid.nspace, pmix_globals.myid.rank, + ft->file)); + + /* stat the file and get its info */ + if (0 > stat(ft->file, &buf)) { + /* cannot stat file */ + PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output, + "[%s:%d] could not stat %s", + pmix_globals.myid.nspace, pmix_globals.myid.rank, + ft->file)); + /* re-add the timer, in case this file shows up */ + pmix_event_evtimer_add(&ft->ev, &ft->tv); + return; + } + + PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output, + "[%s:%d] size %lu access %s\tmod %s", + pmix_globals.myid.nspace, pmix_globals.myid.rank, + (unsigned long)buf.st_size, ctime(&buf.st_atime), ctime(&buf.st_mtime))); + + if (ft->file_size) { + if (buf.st_size == (int64_t)ft->last_size) { + ft->nmisses++; + } else { + ft->nmisses = 0; + ft->last_size = buf.st_size; + } + } else if (ft->file_access) { + if (buf.st_atime == ft->last_access) { + ft->nmisses++; + } else { + ft->nmisses = 0; + ft->last_access = buf.st_atime; + } + } else if (ft->file_mod) { + if
(buf.st_mtime == ft->last_mod) { + ft->nmisses++; + } else { + ft->nmisses = 0; + ft->last_mod = buf.st_mtime; + } + } + + PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output, + "[%s:%d] sampled file %s misses %d", + pmix_globals.myid.nspace, pmix_globals.myid.rank, + ft->file, ft->nmisses)); + + if (ft->nmisses == ft->ndrops) { + if (4 < pmix_output_get_verbosity(pmix_psensor_base_framework.framework_output)) { + pmix_show_help("help-pmix-psensor-file.txt", "file-stalled", true, + ft->file, ft->last_size, ctime(&ft->last_access), ctime(&ft->last_mod)); + } + /* stop monitoring this client */ + pmix_list_remove_item(&mca_psensor_file_component.trackers, &ft->super); + /* generate an event */ + (void)strncpy(source.nspace, ft->requestor->info->nptr->nspace, PMIX_MAX_NSLEN); + source.rank = ft->requestor->info->rank; + rc = PMIx_Notify_event(PMIX_MONITOR_FILE_ALERT, &source, + ft->range, ft->info, ft->ninfo, opcbfunc, ft); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + return; + } + + /* re-add the timer */ + pmix_event_evtimer_add(&ft->ev, &ft->tv); +} diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file.h b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file.h new file mode 100644 index 00000000000..f78502cd8ec --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * + * File movement sensor + */ +#ifndef PMIX_PSENSOR_FILE_H +#define PMIX_PSENSOR_FILE_H + +#include + +#include "src/class/pmix_list.h" + +#include "src/mca/psensor/psensor.h" + +BEGIN_C_DECLS + +typedef struct { + pmix_psensor_base_component_t super; + pmix_list_t trackers; +} pmix_psensor_file_component_t; + +extern pmix_psensor_file_component_t mca_psensor_file_component; +extern pmix_psensor_base_module_t pmix_psensor_file_module; + + +END_C_DECLS + +#endif diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file_component.c b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file_component.c new file mode 100644 index 00000000000..2b751d71992 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/file/psensor_file_component.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include + +#include "src/class/pmix_list.h" + +#include "src/mca/psensor/base/base.h" +#include "src/mca/psensor/file/psensor_file.h" + +/* + * Local functions + */ +static int psensor_file_open(void); +static int psensor_file_close(void); +static int psensor_file_query(pmix_mca_base_module_t **module, int *priority); + +pmix_psensor_file_component_t mca_psensor_file_component = { + .super = { + .base = { + PMIX_PSENSOR_BASE_VERSION_1_0_0, + + /* Component name and version */ + .pmix_mca_component_name = "file", + PMIX_MCA_BASE_MAKE_VERSION(component, + PMIX_MAJOR_VERSION, + PMIX_MINOR_VERSION, + PMIX_RELEASE_VERSION), + + /* Component open and close functions */ + psensor_file_open, /* component open */ + psensor_file_close, /* component close */ + psensor_file_query /* component query */ + }, + } +}; + + +static int psensor_file_open(void) +{ + PMIX_CONSTRUCT(&mca_psensor_file_component.trackers, pmix_list_t); + return PMIX_SUCCESS; +} + + +static int psensor_file_query(pmix_mca_base_module_t **module, int *priority) +{ + *priority = 20; /* irrelevant */ + *module = (pmix_mca_base_module_t *)&pmix_psensor_file_module; + return PMIX_SUCCESS; +} + +/** + * Close all subsystems. + */ + +static int psensor_file_close(void) +{ + PMIX_LIST_DESTRUCT(&mca_psensor_file_component.trackers); + return PMIX_SUCCESS; +} diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/Makefile.am b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/Makefile.am new file mode 100644 index 00000000000..df4fe0466a7 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/Makefile.am @@ -0,0 +1,38 @@ +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# +# Copyright (c) 2017 Intel, Inc. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pmixdata_DATA = help-pmix-psensor-heartbeat.txt + +sources = \ + psensor_heartbeat.c \ + psensor_heartbeat.h \ + psensor_heartbeat_component.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_pmix_psensor_heartbeat_DSO +component_noinst = +component_install = mca_psensor_heartbeat.la +else +component_noinst = libmca_psensor_heartbeat.la +component_install = +endif + +mcacomponentdir = $(pmixlibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_psensor_heartbeat_la_SOURCES = $(sources) +mca_psensor_heartbeat_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_psensor_heartbeat_la_SOURCES =$(sources) +libmca_psensor_heartbeat_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/sensor/heartbeat/help-orte-sensor-heartbeat.txt b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/help-pmix-psensor-heartbeat.txt similarity index 98% rename from orte/mca/sensor/heartbeat/help-orte-sensor-heartbeat.txt rename to opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/help-pmix-psensor-heartbeat.txt index 4b27231a3ac..945e60badb4 100644 --- a/orte/mca/sensor/heartbeat/help-orte-sensor-heartbeat.txt +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/help-pmix-psensor-heartbeat.txt @@ -4,9 +4,9 @@ # # Copyright (c) 2017 Intel, Inc. All rights reserved. 
# $COPYRIGHT$ -# +# # Additional copyrights may follow -# +# # $HEADER$ # # This is the US/English general help file for the memory usage sensor @@ -18,4 +18,3 @@ Node: %s Process rank: %s Memory used: %luGbytes Memory limit: %luGbytes - diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/psensor_heartbeat.c b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/psensor_heartbeat.c new file mode 100644 index 00000000000..7445ceb8d89 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/psensor_heartbeat.c @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * reserved. + * + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif /* HAVE_STRING_H */ +#include +#include +#include PMIX_EVENT_HEADER + +#include "src/util/argv.h" +#include "src/util/error.h" +#include "src/util/output.h" +#include "src/util/show_help.h" +#include "src/include/pmix_globals.h" +#include "src/mca/ptl/ptl.h" + +#include "src/mca/psensor/base/base.h" +#include "psensor_heartbeat.h" + +/* declare the API functions */ +static pmix_status_t heartbeat_start(pmix_peer_t *requestor, pmix_status_t error, + const pmix_info_t *monitor, + const pmix_info_t directives[], size_t ndirs); +static pmix_status_t heartbeat_stop(pmix_peer_t *requestor, char *id); + +/* instantiate the module */ +pmix_psensor_base_module_t pmix_psensor_heartbeat_module = { + .start = heartbeat_start, + .stop = heartbeat_stop +}; + +/* tracker object */ +typedef struct { + pmix_list_item_t super; + pmix_peer_t *requestor; + char *id; + bool event_active; + pmix_event_t ev; + pmix_event_t cdev; + struct timeval tv; + uint32_t nbeats; + uint32_t ndrops; + uint32_t nmissed; + 
pmix_status_t error; + pmix_data_range_t range; + pmix_info_t *info; + size_t ninfo; +} pmix_heartbeat_trkr_t; + +static void ft_constructor(pmix_heartbeat_trkr_t *ft) +{ + ft->requestor = NULL; + ft->id = NULL; + ft->event_active = false; + ft->tv.tv_sec = 0; + ft->tv.tv_usec = 0; + ft->nbeats = 0; + ft->ndrops = 0; + ft->nmissed = 0; + ft->error = PMIX_SUCCESS; + ft->range = PMIX_RANGE_NAMESPACE; + ft->info = NULL; + ft->ninfo = 0; +} +static void ft_destructor(pmix_heartbeat_trkr_t *ft) +{ + if (NULL != ft->requestor) { + PMIX_RELEASE(ft->requestor); + } + if (NULL != ft->id) { + free(ft->id); + } + if (ft->event_active) { + pmix_event_del(&ft->ev); + } + if (NULL != ft->info) { + PMIX_INFO_FREE(ft->info, ft->ninfo); + } +} +PMIX_CLASS_INSTANCE(pmix_heartbeat_trkr_t, + pmix_list_item_t, + ft_constructor, ft_destructor); + +/* define a local caddy */ +typedef struct { + pmix_object_t super; + pmix_event_t ev; + pmix_peer_t *requestor; + char *id; +} heartbeat_caddy_t; +static void cd_con(heartbeat_caddy_t *p) +{ + p->requestor = NULL; + p->id = NULL; +} +static void cd_des(heartbeat_caddy_t *p) +{ + if (NULL != (p->requestor)) { + PMIX_RELEASE(p->requestor); + } + if (NULL != p->id) { + free(p->id); + } +} +PMIX_CLASS_INSTANCE(heartbeat_caddy_t, + pmix_object_t, + cd_con, cd_des); + +typedef struct { + pmix_object_t super; + pmix_event_t ev; + pmix_peer_t *peer; +} pmix_psensor_beat_t; + +static void bcon(pmix_psensor_beat_t *p) +{ + p->peer = NULL; +} +static void bdes(pmix_psensor_beat_t *p) +{ + if (NULL != p->peer) { + PMIX_RELEASE(p->peer); + } +} +PMIX_CLASS_INSTANCE(pmix_psensor_beat_t, + pmix_object_t, + bcon, bdes); + +static void check_heartbeat(int fd, short dummy, void *arg); + +static void add_tracker(int sd, short flags, void *cbdata) +{ + pmix_heartbeat_trkr_t *ft = (pmix_heartbeat_trkr_t*)cbdata; + + /* add the tracker to our list */ + pmix_list_append(&mca_psensor_heartbeat_component.trackers, &ft->super); + + /* setup the timer event */ + 
pmix_event_evtimer_set(pmix_psensor_base.evbase, &ft->ev, + check_heartbeat, ft); + pmix_event_evtimer_add(&ft->ev, &ft->tv); + ft->event_active = true; +} + +static pmix_status_t heartbeat_start(pmix_peer_t *requestor, pmix_status_t error, + const pmix_info_t *monitor, + const pmix_info_t directives[], size_t ndirs) +{ + pmix_heartbeat_trkr_t *ft; + size_t n; + + PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output, + "[%s:%d] checking heartbeat monitoring for requestor %s:%d", + pmix_globals.myid.nspace, pmix_globals.myid.rank, + requestor->info->nptr->nspace, requestor->info->rank)); + + /* if they didn't ask for heartbeats, then nothing for us to do */ + if (0 != strcmp(monitor->key, PMIX_MONITOR_HEARTBEAT)) { + return PMIX_ERR_TAKE_NEXT_OPTION; + } + + /* setup to track this monitoring operation */ + ft = PMIX_NEW(pmix_heartbeat_trkr_t); + PMIX_RETAIN(requestor); + ft->requestor = requestor; + ft->error = error; + + /* check the directives to see what they want monitored */ + for (n=0; n < ndirs; n++) { + if (0 == strcmp(directives[n].key, PMIX_MONITOR_HEARTBEAT_TIME)) { + ft->tv.tv_sec = directives[n].value.data.uint32; + } else if (0 == strcmp(directives[n].key, PMIX_MONITOR_HEARTBEAT_DROPS)) { + ft->ndrops = directives[n].value.data.uint32; + } else if (0 == strcmp(directives[n].key, PMIX_RANGE)) { + ft->range = directives[n].value.data.range; + } + } + + if (0 == ft->tv.tv_sec) { + /* didn't specify the heartbeat time window (PMIX_MONITOR_HEARTBEAT_TIME), so there is nothing to time against */ + PMIX_RELEASE(ft); + return PMIX_ERR_BAD_PARAM; + } + + /* need to push into our event base to add this to our trackers */ + pmix_event_assign(&ft->cdev, pmix_psensor_base.evbase, -1, + EV_WRITE, add_tracker, ft); + pmix_event_active(&ft->cdev, EV_WRITE, 1); + + return PMIX_SUCCESS; +} + +static void del_tracker(int sd, short flags, void *cbdata) +{ + heartbeat_caddy_t *cd = (heartbeat_caddy_t*)cbdata; + pmix_heartbeat_trkr_t *ft, *ftnext; + + /* remove the tracker from our list */ +
PMIX_LIST_FOREACH_SAFE(ft, ftnext, &mca_psensor_heartbeat_component.trackers, pmix_heartbeat_trkr_t) { + if (ft->requestor != cd->requestor) { + continue; + } + if (NULL == cd->id || + (NULL != ft->id && 0 == strcmp(ft->id, cd->id))) { + pmix_list_remove_item(&mca_psensor_heartbeat_component.trackers, &ft->super); + PMIX_RELEASE(ft); + } + } + PMIX_RELEASE(cd); +} + +static pmix_status_t heartbeat_stop(pmix_peer_t *requestor, char *id) +{ + heartbeat_caddy_t *cd; + + cd = PMIX_NEW(heartbeat_caddy_t); + PMIX_RETAIN(requestor); + cd->requestor = requestor; + cd->id = (NULL == id) ? NULL : strdup(id); /* a NULL id is legal: del_tracker treats it as "all trackers of this requestor" */ + + /* need to push into our event base to remove the matching trackers */ + pmix_event_assign(&cd->ev, pmix_psensor_base.evbase, -1, + EV_WRITE, del_tracker, cd); + pmix_event_active(&cd->ev, EV_WRITE, 1); + + return PMIX_SUCCESS; +} + +static void opcbfunc(pmix_status_t status, void *cbdata) +{ + pmix_heartbeat_trkr_t *ft = (pmix_heartbeat_trkr_t*)cbdata; + + PMIX_RELEASE(ft); +} + +/* this function automatically gets periodically called + * by the event library so we can check on the state + * of the various procs we are monitoring + */ +static void check_heartbeat(int fd, short dummy, void *cbdata) +{ + pmix_heartbeat_trkr_t *ft = (pmix_heartbeat_trkr_t*)cbdata; + pmix_status_t rc; + pmix_proc_t source; + + PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output, + "[%s:%d] sensor:check_heartbeat for proc %s:%d", + pmix_globals.myid.nspace, pmix_globals.myid.rank, + ft->requestor->info->nptr->nspace, ft->requestor->info->rank)); + + if (0 == ft->nbeats) { + /* no heartbeat recvd in last window */ + PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output, + "[%s:%d] sensor:check_heartbeat failed for proc %s:%d", + pmix_globals.myid.nspace, pmix_globals.myid.rank, + ft->requestor->info->nptr->nspace, ft->requestor->info->rank)); + /* stop monitoring this client */ + pmix_list_remove_item(&mca_psensor_heartbeat_component.trackers, &ft->super); + /* generate an event */
+ (void)strncpy(source.nspace, ft->requestor->info->nptr->nspace, PMIX_MAX_NSLEN); + source.rank = ft->requestor->info->rank; + rc = PMIx_Notify_event(PMIX_MONITOR_HEARTBEAT_ALERT, &source, + ft->range, ft->info, ft->ninfo, opcbfunc, ft); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + } + return; + } else { + PMIX_OUTPUT_VERBOSE((1, pmix_psensor_base_framework.framework_output, + "[%s:%d] sensor:check_heartbeat detected %d beats for proc %s:%d", + pmix_globals.myid.nspace, pmix_globals.myid.rank, ft->nbeats, + ft->requestor->info->nptr->nspace, ft->requestor->info->rank)); + } + /* reset for next period */ + ft->nbeats = 0; + + /* reset the timer */ + pmix_event_evtimer_add(&ft->ev, &ft->tv); +} + +static void add_beat(int sd, short args, void *cbdata) +{ + pmix_psensor_beat_t *b = (pmix_psensor_beat_t*)cbdata; + pmix_heartbeat_trkr_t *ft; + + /* find this peer in our trackers */ + PMIX_LIST_FOREACH(ft, &mca_psensor_heartbeat_component.trackers, pmix_heartbeat_trkr_t) { + if (ft->requestor == b->peer) { + /* increment the beat count */ + ++ft->nbeats; + break; + } + } + + PMIX_RELEASE(b); +} + +void pmix_psensor_heartbeat_recv_beats(struct pmix_peer_t *peer, + pmix_ptl_hdr_t *hdr, + pmix_buffer_t *buf, void *cbdata) +{ + pmix_psensor_beat_t *b; + + b = PMIX_NEW(pmix_psensor_beat_t); + PMIX_RETAIN(peer); + b->peer = peer; + + /* shift this to our thread for processing */ + pmix_event_assign(&b->ev, pmix_psensor_base.evbase, -1, + EV_WRITE, add_beat, b); + pmix_event_active(&b->ev, EV_WRITE, 1); +} diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/psensor_heartbeat.h b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/psensor_heartbeat.h new file mode 100644 index 00000000000..2f904b60359 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/psensor_heartbeat.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. 
+ * + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * + * Heartbeat sensor + */ +#ifndef PMIX_PSENSOR_HEARTBEAT_H +#define PMIX_PSENSOR_HEARTBEAT_H + +#include +#include + +#include "src/class/pmix_list.h" +#include "src/include/pmix_globals.h" +#include "src/mca/psensor/psensor.h" + +BEGIN_C_DECLS + +typedef struct { + pmix_psensor_base_component_t super; + pmix_list_t trackers; +} pmix_psensor_heartbeat_component_t; + +PMIX_EXPORT extern pmix_psensor_heartbeat_component_t mca_psensor_heartbeat_component; +extern pmix_psensor_base_module_t pmix_psensor_heartbeat_module; + +void pmix_psensor_heartbeat_recv_beats(struct pmix_peer_t *peer, + pmix_ptl_hdr_t *hdr, + pmix_buffer_t *buf, void *cbdata); + +END_C_DECLS + +#endif diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/psensor_heartbeat_component.c b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/psensor_heartbeat_component.c new file mode 100644 index 00000000000..e16a26a347c --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/heartbeat/psensor_heartbeat_component.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include + +#include "src/mca/ptl/ptl.h" +#include "src/mca/psensor/base/base.h" +#include "src/mca/psensor/heartbeat/psensor_heartbeat.h" + +/* + * Local functions + */ + +static int heartbeat_open(void); +static int heartbeat_close(void); +static int heartbeat_query(pmix_mca_base_module_t **module, int *priority); + +pmix_psensor_heartbeat_component_t mca_psensor_heartbeat_component = { + .super = { + .base = { + PMIX_PSENSOR_BASE_VERSION_1_0_0, + + /* Component name and version */ + .pmix_mca_component_name = "heartbeat", + PMIX_MCA_BASE_MAKE_VERSION(component, + PMIX_MAJOR_VERSION, + PMIX_MINOR_VERSION, + PMIX_RELEASE_VERSION), + + /* Component open and close functions */ + heartbeat_open, /* component open */ + heartbeat_close, /* component close */ + heartbeat_query /* component query */ + } + } +}; + + +/** + * component open/close/init function + */ +static int heartbeat_open(void) +{ + PMIX_CONSTRUCT(&mca_psensor_heartbeat_component.trackers, pmix_list_t); + + /* setup to receive heartbeats */ + pmix_ptl.recv(pmix_globals.mypeer, pmix_psensor_heartbeat_recv_beats, PMIX_PTL_TAG_HEARTBEAT); + + return PMIX_SUCCESS; +} + + +static int heartbeat_query(pmix_mca_base_module_t **module, int *priority) +{ + *priority = 5; // irrelevant + *module = (pmix_mca_base_module_t *)&pmix_psensor_heartbeat_module; + return PMIX_SUCCESS; +} + +/** + * Component close: cancel the heartbeat recv and destruct the tracker list.
+ */ + +static int heartbeat_close(void) +{ + /* cancel our persistent recv */ + pmix_ptl.cancel(pmix_globals.mypeer, PMIX_PTL_TAG_HEARTBEAT); + + PMIX_LIST_DESTRUCT(&mca_psensor_heartbeat_component.trackers); + + return PMIX_SUCCESS; +} diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/psensor.h b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/psensor.h new file mode 100644 index 00000000000..e1c019e388c --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/psensor/psensor.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * @file: + * + */ + +#ifndef PMIX_PSENSOR_H_ +#define PMIX_PSENSOR_H_ + +#include + +#include "src/class/pmix_list.h" +#include "src/mca/mca.h" +#include "src/include/pmix_globals.h" + +BEGIN_C_DECLS + +/* + * Component functions - all MUST be provided! + */ + +/* start a sensor operation: + * + * requestor - the process requesting this operation + * + * monitor - a PMIx attribute specifying what is to be monitored + * + * directives - an array of pmix_info_t specifying relevant limits on values, and action + * to be taken when limits exceeded. Can include + * user-provided "id" string */ +typedef pmix_status_t (*pmix_psensor_base_module_start_fn_t)(pmix_peer_t *requestor, pmix_status_t error, + const pmix_info_t *monitor, + const pmix_info_t directives[], size_t ndirs); + +/* stop a sensor operation: + * + * requestor - the process requesting this operation + * + * id - the "id" string provided by the user at the time the + * affected monitoring operation was started. 
A NULL indicates + * that all operations started by this requestor are to + * be terminated */ +typedef pmix_status_t (*pmix_psensor_base_module_stop_fn_t)(pmix_peer_t *requestor, + char *id); + +/* API module */ +/* + * Ver 1.0 + */ +typedef struct pmix_psensor_base_module_1_0_0_t { + pmix_psensor_base_module_start_fn_t start; + pmix_psensor_base_module_stop_fn_t stop; +} pmix_psensor_base_module_t; + +/* + * the standard component data structure + */ +typedef struct pmix_psensor_base_component_1_0_0_t { + pmix_mca_base_component_t base; + pmix_mca_base_component_data_t data; +} pmix_psensor_base_component_t; + + + +/* + * Macro for use in components that are of type sensor v1.0.0 + */ +#define PMIX_PSENSOR_BASE_VERSION_1_0_0 \ + PMIX_MCA_BASE_VERSION_1_0_0("psensor", 1, 0, 0) + +/* Global structure for accessing sensor functions + */ +PMIX_EXPORT extern pmix_psensor_base_module_t pmix_psensor; /* holds API function pointers */ + +END_C_DECLS + +#endif /* PMIX_PSENSOR_H_ */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/base.h b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/base.h index a99e277f5f0..ac92ed9dc97 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/base.h +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/base.h @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved.
* $COPYRIGHT$ @@ -73,6 +73,7 @@ struct pmix_ptl_globals_t { pmix_list_t actives; bool initialized; pmix_list_t posted_recvs; // list of pmix_ptl_posted_recv_t + pmix_list_t unexpected_msgs; int stop_thread[2]; bool listen_thread_active; pmix_list_t listeners; @@ -93,6 +94,11 @@ PMIX_EXPORT pmix_status_t pmix_ptl_stub_send_oneway(struct pmix_peer_t *peer, pmix_ptl_tag_t tag); PMIX_EXPORT pmix_status_t pmix_ptl_stub_connect_to_peer(struct pmix_peer_t *peer, pmix_info_t info[], size_t ninfo); +PMIX_EXPORT pmix_status_t pmix_ptl_stub_register_recv(struct pmix_peer_t *peer, + pmix_ptl_cbfunc_t cbfunc, + pmix_ptl_tag_t tag); +PMIX_EXPORT pmix_status_t pmix_ptl_stub_cancel_recv(struct pmix_peer_t *peer, + pmix_ptl_tag_t tag); PMIX_EXPORT pmix_status_t pmix_ptl_base_start_listening(pmix_info_t *info, size_t ninfo); PMIX_EXPORT void pmix_ptl_base_stop_listening(void); diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_frame.c b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_frame.c index 00799c46082..c17029d46f8 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_frame.c +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_frame.c @@ -61,6 +61,8 @@ pmix_ptl_API_t pmix_ptl = { .send_recv = pmix_ptl_stub_send_recv, .send_oneway = pmix_ptl_stub_send_oneway, .connect_to_peer = pmix_ptl_stub_connect_to_peer, + .recv = pmix_ptl_stub_register_recv, + .cancel = pmix_ptl_stub_cancel_recv, .start_listening = pmix_ptl_base_start_listening, .stop_listening = pmix_ptl_base_stop_listening }; @@ -88,6 +90,7 @@ static pmix_status_t pmix_ptl_close(void) /* the components will cleanup when closed */ PMIX_LIST_DESTRUCT(&pmix_ptl_globals.actives); PMIX_LIST_DESTRUCT(&pmix_ptl_globals.posted_recvs); + PMIX_LIST_DESTRUCT(&pmix_ptl_globals.unexpected_msgs); PMIX_LIST_DESTRUCT(&pmix_ptl_globals.listeners); return pmix_mca_base_framework_components_close(&pmix_ptl_base_framework, NULL); @@ -99,6 +102,7 @@ static pmix_status_t 
pmix_ptl_open(pmix_mca_base_open_flag_t flags) pmix_ptl_globals.initialized = true; PMIX_CONSTRUCT(&pmix_ptl_globals.actives, pmix_list_t); PMIX_CONSTRUCT(&pmix_ptl_globals.posted_recvs, pmix_list_t); + PMIX_CONSTRUCT(&pmix_ptl_globals.unexpected_msgs, pmix_list_t); pmix_ptl_globals.listen_thread_active = false; PMIX_CONSTRUCT(&pmix_ptl_globals.listeners, pmix_list_t); pmix_client_globals.myserver.sd = -1; diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c index 88f9bca94a1..705d7861ab7 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c @@ -46,7 +46,7 @@ #include "src/mca/ptl/base/base.h" -static uint32_t current_tag = 1; // 0 is reserved for system purposes +static uint32_t current_tag = PMIX_PTL_TAG_DYNAMIC; static void _notify_complete(pmix_status_t status, void *cbdata) { @@ -182,7 +182,7 @@ static pmix_status_t send_msg(int sd, pmix_ptl_send_t *msg) } else { iov_count = 1; } -retry: + retry: rc = writev(sd, iov, iov_count); if (PMIX_LIKELY(rc == remain)) { /* we successfully sent the header and the msg data if any */ @@ -541,16 +541,16 @@ void pmix_ptl_base_send_recv(int fd, short args, void *cbdata) return; } - /* set the tag */ - tag = current_tag++; + /* take the next tag in the sequence */ + current_tag++; + if (UINT32_MAX == current_tag ) { + current_tag = PMIX_PTL_TAG_DYNAMIC; + } + tag = current_tag; if (NULL != ms->cbfunc) { /* if a callback msg is expected, setup a recv for it */ req = PMIX_NEW(pmix_ptl_posted_recv_t); - /* take the next tag in the sequence */ - if (UINT32_MAX == current_tag ) { - current_tag = 1; - } req->tag = tag; req->cbfunc = ms->cbfunc; req->cbdata = ms->cbdata; @@ -617,23 +617,29 @@ void pmix_ptl_base_process_msg(int fd, short flags, void *cbdata) buf.pack_ptr = ((char*)buf.base_ptr) + buf.bytes_used; } msg->data = NULL; // protect the data region 
- if (NULL != rcv->cbfunc) { - rcv->cbfunc(msg->peer, &msg->hdr, &buf, rcv->cbdata); - } + rcv->cbfunc(msg->peer, &msg->hdr, &buf, rcv->cbdata); PMIX_DESTRUCT(&buf); // free's the msg data - /* also done with the recv, if not a wildcard or the error tag */ - if (UINT32_MAX != rcv->tag && 0 != rcv->tag) { - pmix_list_remove_item(&pmix_ptl_globals.posted_recvs, &rcv->super); - PMIX_RELEASE(rcv); - } - PMIX_RELEASE(msg); - return; } + /* done with the recv if it is a dynamic tag */ + if (PMIX_PTL_TAG_DYNAMIC <= rcv->tag && UINT_MAX != rcv->tag) { + pmix_list_remove_item(&pmix_ptl_globals.posted_recvs, &rcv->super); + PMIX_RELEASE(rcv); + } + PMIX_RELEASE(msg); + return; } } - /* we get here if no matching recv was found - this is an error */ - pmix_output(0, "UNEXPECTED MESSAGE tag = %d", msg->hdr.tag); - PMIX_RELEASE(msg); - PMIX_REPORT_EVENT(PMIX_ERROR, _notify_complete); + /* if the tag in this message is above the dynamic marker, then + * that is an error */ + if (PMIX_PTL_TAG_DYNAMIC <= msg->hdr.tag) { + pmix_output(0, "UNEXPECTED MESSAGE tag = %d", msg->hdr.tag); + PMIX_RELEASE(msg); + PMIX_REPORT_EVENT(PMIX_ERROR, _notify_complete); + return; + } + + /* it is possible that someone may post a recv for this message + * at some point, so we have to hold onto it */ + pmix_list_append(&pmix_ptl_globals.unexpected_msgs, &msg->super); } diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_stubs.c b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_stubs.c index a82d4112e60..f13fde1bd78 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_stubs.c +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_stubs.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2015-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -105,3 +105,92 @@ pmix_status_t pmix_ptl_stub_connect_to_peer(struct pmix_peer_t *peer, return PMIX_ERR_UNREACH; } + +static void post_recv(int fd, short args, void *cbdata) +{ + pmix_ptl_posted_recv_t *req = (pmix_ptl_posted_recv_t*)cbdata; + pmix_ptl_recv_t *msg, *nmsg; + pmix_buffer_t buf; + + pmix_output_verbose(5, pmix_globals.debug_output, + "posting recv on tag %d", req->tag); + + /* add it to the list of recvs */ + pmix_list_append(&pmix_ptl_globals.posted_recvs, &req->super); + + /* now check the unexpected msg queue to see if we already + * recvd something for it */ + PMIX_LIST_FOREACH_SAFE(msg, nmsg, &pmix_ptl_globals.unexpected_msgs, pmix_ptl_recv_t) { + if (msg->hdr.tag == req->tag || UINT_MAX == req->tag) { + if (NULL != req->cbfunc) { + /* construct and load the buffer */ + PMIX_CONSTRUCT(&buf, pmix_buffer_t); + if (NULL != msg->data) { + buf.base_ptr = (char*)msg->data; + buf.bytes_allocated = buf.bytes_used = msg->hdr.nbytes; + buf.unpack_ptr = buf.base_ptr; + buf.pack_ptr = ((char*)buf.base_ptr) + buf.bytes_used; + } + msg->data = NULL; // protect the data region + req->cbfunc(msg->peer, &msg->hdr, &buf, req->cbdata); + PMIX_DESTRUCT(&buf); // free's the msg data + } + pmix_list_remove_item(&pmix_ptl_globals.unexpected_msgs, &msg->super); + PMIX_RELEASE(msg); + } + } +} + +pmix_status_t pmix_ptl_stub_register_recv(struct pmix_peer_t *peer, + pmix_ptl_cbfunc_t cbfunc, + pmix_ptl_tag_t tag) +{ + pmix_ptl_posted_recv_t *req; + + req = PMIX_NEW(pmix_ptl_posted_recv_t); + if (NULL == req) { + return PMIX_ERR_NOMEM; + } + req->tag = tag; + req->cbfunc = cbfunc; + /* have to push this into an event so we can add this + * to the list of posted recvs */ + pmix_event_assign(&(req->ev), pmix_globals.evbase, -1, + EV_WRITE, post_recv, req); + pmix_event_active(&(req->ev), EV_WRITE, 1); + return PMIX_SUCCESS; +} + +static void cancel_recv(int fd, short args, void *cbdata) +{ + pmix_ptl_posted_recv_t *req = 
(pmix_ptl_posted_recv_t*)cbdata; + pmix_ptl_posted_recv_t *rcv; + + PMIX_LIST_FOREACH(rcv, &pmix_ptl_globals.posted_recvs, pmix_ptl_posted_recv_t) { + if (rcv->tag == req->tag) { + pmix_list_remove_item(&pmix_ptl_globals.posted_recvs, &rcv->super); + PMIX_RELEASE(rcv); + PMIX_RELEASE(req); + return; + } + } + PMIX_RELEASE(req); +} + +pmix_status_t pmix_ptl_stub_cancel_recv(struct pmix_peer_t *peer, + pmix_ptl_tag_t tag) +{ + pmix_ptl_posted_recv_t *req; + + req = PMIX_NEW(pmix_ptl_posted_recv_t); + if (NULL == req) { + return PMIX_ERR_NOMEM; + } + req->tag = tag; + /* have to push this into an event so we can modify + * the list of posted recvs */ + pmix_event_assign(&(req->ev), pmix_globals.evbase, -1, + EV_WRITE, cancel_recv, req); + pmix_event_active(&(req->ev), EV_WRITE, 1); + return PMIX_SUCCESS; +} diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/ptl.h b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/ptl.h index 3681f8bb46c..f2f5ad6033f 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/ptl.h +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/ptl.h @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 Mellanox Technologies, Inc. 
@@ -110,6 +110,15 @@ typedef pmix_status_t (*pmix_ptl_send_fn_t)(struct pmix_peer_t *peer, pmix_buffer_t *bfr, pmix_ptl_tag_t tag); +/* (ONE-WAY) register a persistent recv */ +typedef pmix_status_t (*pmix_ptl_recv_fn_t)(struct pmix_peer_t *peer, + pmix_ptl_cbfunc_t cbfunc, + pmix_ptl_tag_t tag); + +/* Cancel a persistent recv */ +typedef pmix_status_t (*pmix_ptl_cancel_fn_t)(struct pmix_peer_t *peer, + pmix_ptl_tag_t tag); + /* connect to a peer - this is a blocking function * to establish a connection to a peer. It assigns * the corresponding module to the peer's compat @@ -126,6 +135,8 @@ struct pmix_ptl_module_t { pmix_ptl_finalize_fn_t finalize; pmix_ptl_send_recv_fn_t send_recv; pmix_ptl_send_fn_t send; + pmix_ptl_recv_fn_t recv; + pmix_ptl_cancel_fn_t cancel; pmix_ptl_connect_to_peer_fn_t connect_to_peer; }; typedef struct pmix_ptl_module_t pmix_ptl_module_t; @@ -152,6 +163,8 @@ typedef struct { pmix_ptl_get_available_modules_fn_t get_available_modules; pmix_ptl_send_recv_fn_t send_recv; pmix_ptl_send_fn_t send_oneway; + pmix_ptl_recv_fn_t recv; + pmix_ptl_cancel_fn_t cancel; pmix_ptl_connect_to_peer_fn_t connect_to_peer; pmix_ptl_start_listening_fn_t start_listening; pmix_ptl_stop_listening_fn_t stop_listening; diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/ptl_types.h b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/ptl_types.h index 782a10779b6..e5571c35dbe 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/ptl_types.h +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/ptl_types.h @@ -63,6 +63,16 @@ struct pmix_ptl_module_t; /**** MESSAGING STRUCTURES ****/ typedef uint32_t pmix_ptl_tag_t; +/* define a range of "reserved" tags - these + * are tags that are used for persistent recvs + * within the system */ +#define PMIX_PTL_TAG_NOTIFY 0 +#define PMIX_PTL_TAG_HEARTBEAT 1 + +/* define the start of dynamic tags that are + * assigned for send/recv operations */ +#define PMIX_PTL_TAG_DYNAMIC 100 + /* header for messages */ typedef struct { diff --git 
a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_params.c b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_params.c index eeea597f7d3..e2c60025bb8 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_params.c +++ b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_params.c @@ -21,7 +21,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Mellanox Technologies, Inc. * All rights reserved. - * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -37,7 +37,6 @@ #include "src/util/timings.h" #if PMIX_ENABLE_TIMING -char *pmix_timing_sync_file = NULL; char *pmix_timing_output = NULL; bool pmix_timing_overhead = true; #endif @@ -56,16 +55,6 @@ pmix_status_t pmix_register_params(void) pmix_register_done = true; #if PMIX_ENABLE_TIMING - pmix_timing_sync_file = NULL; - (void) pmix_mca_base_var_register ("pmix", "pmix", NULL, "timing_sync_file", - "Clock synchronisation information generated by mpisync tool. You don't need to touch this if you use mpirun_prof tool.", - PMIX_MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - PMIX_INFO_LVL_9, PMIX_MCA_BASE_VAR_SCOPE_ALL, - &pmix_timing_sync_file); - if( pmix_timing_clocksync_read(pmix_timing_sync_file) ){ - pmix_output(0, "Cannot read file %s containing clock synchronisation information\n", pmix_timing_sync_file); - } - pmix_timing_output = NULL; (void) pmix_mca_base_var_register ("pmix", "pmix", NULL, "timing_output", "The name of output file for timing information. If this parameter is not set then output will be directed into PMIX debug channel.", diff --git a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_progress_threads.h b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_progress_threads.h index 9a09a049c3e..1dfb1df48b3 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_progress_threads.h +++ b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_progress_threads.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2016 Intel, Inc. 
All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * @@ -13,6 +13,11 @@ #include "pmix_config.h" +#include +#include PMIX_EVENT_HEADER + +#include "src/include/types.h" + /** * Initialize a progress thread name; if a progress thread is not * already associated with that name, start a progress thread. diff --git a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c index dd37c289bac..ed445a4a927 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c +++ b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c @@ -990,11 +990,11 @@ static void _dmodex_req(int sd, short args, void *cbdata) * may not be a contribution */ if (PMIX_SUCCESS == (rc = pmix_hash_fetch(&nptr->server->myremote, info->rank, "modex", &val)) && NULL != val) { - data = val->data.bo.bytes; - sz = val->data.bo.size; - /* protect the data */ - val->data.bo.bytes = NULL; - val->data.bo.size = 0; + data = val->data.bo.bytes; + sz = val->data.bo.size; + /* protect the data */ + val->data.bo.bytes = NULL; + val->data.bo.size = 0; PMIX_VALUE_RELEASE(val); } @@ -1850,7 +1850,7 @@ static void _mdxcbfunc(int sd, short argc, void *cbdata) } finish_collective: - if(NULL != databuf) { + if (NULL != databuf) { PMIX_RELEASE(databuf); } /* setup the reply, starting with the returned status */ @@ -2345,6 +2345,18 @@ static pmix_status_t server_switchyard(pmix_peer_t *peer, uint32_t tag, return rc; } + if (PMIX_JOB_CONTROL_CMD == cmd) { + PMIX_PEER_CADDY(cd, peer, tag); + rc = pmix_server_job_ctrl(peer, buf, query_cbfunc, cd); + return rc; + } + + if (PMIX_MONITOR_CMD == cmd) { + PMIX_PEER_CADDY(cd, peer, tag); + rc = pmix_server_monitor(peer, buf, query_cbfunc, cd); + return rc; + } + return PMIX_ERR_NOT_SUPPORTED; } diff --git a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_get.c b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_get.c index 
3b8490a9b54..278176ad725 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_get.c +++ b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_get.c @@ -612,10 +612,10 @@ static void _process_dmdx_reply(int fd, short args, void *cbdata) } if (NULL == nptr) { -/* - * We may not have this namespace because someone asked about this namespace - * but there are not processses from it running on this host - */ + /* + * We may not have this namespace because someone asked about this namespace + * but there are no processes from it running on this host + */ nptr = PMIX_NEW(pmix_nspace_t); (void)strncpy(nptr->nspace, caddy->lcd->proc.nspace, PMIX_MAX_NSLEN); nptr->server = PMIX_NEW(pmix_server_nspace_t); @@ -628,8 +628,12 @@ static void _process_dmdx_reply(int fd, short args, void *cbdata) * store the data first so we can immediately satisfy any future * requests. Then, rather than duplicate the resolve code here, we * will let the pmix_pending_resolve function go ahead and retrieve - * it from the hash table */ - if (PMIX_SUCCESS == caddy->status) { + * it from the hash table. + * + * NOTE: A NULL data pointer indicates that the data has already + * been returned via completion of a background fence_nb operation.
+ * In this case, all we need to do is resolve the request */ + if (PMIX_SUCCESS == caddy->status && NULL != caddy->data) { if (caddy->lcd->proc.rank == PMIX_RANK_WILDCARD) { void * where = malloc(caddy->ndata); if (where) { diff --git a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.c b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.c index ba9c749d037..5add656abf1 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.c +++ b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.c @@ -138,7 +138,6 @@ pmix_status_t pmix_server_commit(pmix_peer_t *peer, pmix_buffer_t *buf) pmix_nspace_t *nptr; pmix_rank_info_t *info; pmix_dmdx_remote_t *dcd, *dcdnext; - pmix_buffer_t *pbkt; pmix_value_t *val; char *data; size_t sz; @@ -236,16 +235,19 @@ pmix_status_t pmix_server_commit(pmix_peer_t *peer, pmix_buffer_t *buf) if (dcd->cd->proc.rank == info->rank) { /* we can now fulfill this request - collect the * remote/global data from this proc */ - pbkt = PMIX_NEW(pmix_buffer_t); /* get any remote contribution - note that there * may not be a contribution */ + data = NULL; + sz = 0; if (PMIX_SUCCESS == pmix_hash_fetch(&nptr->server->myremote, info->rank, "modex", &val) && NULL != val) { - PMIX_LOAD_BUFFER(pbkt, val->data.bo.bytes, val->data.bo.size); + data = val->data.bo.bytes; + sz = val->data.bo.size; + /* protect the data */ + val->data.bo.bytes = NULL; + val->data.bo.size = 0; PMIX_VALUE_RELEASE(val); } - PMIX_UNLOAD_BUFFER(pbkt, data, sz); - PMIX_RELEASE(pbkt); /* execute the callback */ dcd->cd->cbfunc(PMIX_SUCCESS, data, sz, dcd->cd->cbdata); if (NULL != data) { @@ -362,8 +364,6 @@ static pmix_server_trkr_t* new_tracker(pmix_proc_t *procs, return NULL; } - assert( NULL == get_tracker(procs, nprocs, type) ); - pmix_output_verbose(5, pmix_globals.debug_output, "adding new tracker with %d procs", (int)nprocs); @@ -466,7 +466,8 @@ pmix_status_t pmix_server_fence(pmix_server_caddy_t *cd, return rc; } pmix_output_verbose(2, pmix_globals.debug_output, - 
"recvd fence with %d procs", (int)nprocs); + "recvd fence from %s:%u with %d procs", + cd->peer->info->nptr->nspace, cd->peer->info->rank, (int)nprocs); /* there must be at least one as the client has to at least provide * their own namespace */ if (nprocs < 1) { @@ -1219,9 +1220,7 @@ void pmix_server_deregister_events(pmix_peer_t *peer, pmix_buffer_t *buf) { int32_t cnt; - pmix_status_t rc, *codes = NULL, *cdptr, maxcode = PMIX_MAX_ERR_CONSTANT; - pmix_info_t *info = NULL; - size_t ninfo=0, ncodes, ncds, n; + pmix_status_t rc, code; pmix_regevents_info_t *reginfo = NULL; pmix_regevents_info_t *reginfo_next; pmix_peer_events_info_t *prev; @@ -1229,34 +1228,11 @@ void pmix_server_deregister_events(pmix_peer_t *peer, pmix_output_verbose(2, pmix_globals.debug_output, "recvd deregister events"); - /* unpack the number of codes */ + /* unpack codes and process until done */ cnt=1; - if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &ncodes, &cnt, PMIX_SIZE))) { - /* it is okay if there aren't any - equivalent to a wildcard */ - ncodes = 0; - } - /* unpack the array of codes */ - if (0 < ncodes) { - codes = (pmix_status_t*)malloc(ncodes * sizeof(pmix_status_t)); - cnt=ncodes; - if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, codes, &cnt, PMIX_STATUS))) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - } - - /* find the event registration info so we can delete them */ - if (NULL == codes) { - cdptr = &maxcode; - ncds = 1; - } else { - cdptr = codes; - ncds = ncodes; - } - - for (n=0; n < ncds; n++) { + while (PMIX_SUCCESS == (rc = pmix_bfrop.unpack(buf, &code, &cnt, PMIX_STATUS))) { PMIX_LIST_FOREACH_SAFE(reginfo, reginfo_next, &pmix_server_globals.events, pmix_regevents_info_t) { - if (cdptr[n] == reginfo->code) { + if (code == reginfo->code) { /* found it - remove this peer from the list */ PMIX_LIST_FOREACH(prev, ®info->peers, pmix_peer_events_info_t) { if (prev->peer == peer) { @@ -1275,15 +1251,9 @@ void pmix_server_deregister_events(pmix_peer_t *peer, } } } - -cleanup: 
- if (NULL != codes) { - free(codes); - } - if (NULL != info) { - PMIX_INFO_FREE(info, ninfo); + if (PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { + PMIX_ERROR_LOG(rc); } - return; } @@ -1562,6 +1532,144 @@ pmix_status_t pmix_server_alloc(pmix_peer_t *peer, return rc; } +pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer, + pmix_buffer_t *buf, + pmix_info_cbfunc_t cbfunc, + void *cbdata) +{ + int32_t cnt; + pmix_status_t rc; + pmix_query_caddy_t *cd; + pmix_proc_t proc; + + pmix_output_verbose(2, pmix_globals.debug_output, + "recvd job control request from client"); + + if (NULL == pmix_host_server.job_control) { + return PMIX_ERR_NOT_SUPPORTED; + } + + cd = PMIX_NEW(pmix_query_caddy_t); + cd->cbdata = cbdata; + + /* unpack the number of targets */ + cnt = 1; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &cd->ntargets, &cnt, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); + goto exit; + } + if (0 < cd->ntargets) { + PMIX_PROC_CREATE(cd->targets, cd->ntargets); + cnt = cd->ntargets; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, cd->targets, &cnt, PMIX_PROC))) { + PMIX_ERROR_LOG(rc); + goto exit; + } + } + /* unpack the number of info objects */ + cnt = 1; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &cd->ninfo, &cnt, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); + goto exit; + } + /* unpack the info */ + if (0 < cd->ninfo) { + PMIX_INFO_CREATE(cd->info, cd->ninfo); + cnt = cd->ninfo; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, cd->info, &cnt, PMIX_INFO))) { + PMIX_ERROR_LOG(rc); + goto exit; + } + } + + /* setup the requesting peer name */ + (void)strncpy(proc.nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN); + proc.rank = peer->info->rank; + + /* ask the host to execute the request */ + if (PMIX_SUCCESS != (rc = pmix_host_server.job_control(&proc, + cd->targets, cd->ntargets, + cd->info, cd->ninfo, + cbfunc, cd))) { + goto exit; + } + return PMIX_SUCCESS; + + exit: + PMIX_RELEASE(cd); + return rc; +} + +pmix_status_t pmix_server_monitor(pmix_peer_t *peer, 
+ pmix_buffer_t *buf, + pmix_info_cbfunc_t cbfunc, + void *cbdata) +{ + int32_t cnt; + pmix_info_t monitor; + pmix_status_t rc, error; + pmix_query_caddy_t *cd; + pmix_proc_t proc; + + pmix_output_verbose(2, pmix_globals.debug_output, + "recvd monitor request from client"); + + if (NULL == pmix_host_server.monitor) { + return PMIX_ERR_NOT_SUPPORTED; + } + + cd = PMIX_NEW(pmix_query_caddy_t); + cd->cbdata = cbdata; + + /* unpack what is to be monitored */ + PMIX_INFO_CONSTRUCT(&monitor); + cnt = 1; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &monitor, &cnt, PMIX_INFO))) { + PMIX_ERROR_LOG(rc); + goto exit; + } + + /* unpack the error code */ + cnt = 1; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &error, &cnt, PMIX_STATUS))) { + PMIX_ERROR_LOG(rc); + goto exit; + } + + /* unpack the number of directives */ + cnt = 1; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, &cd->ninfo, &cnt, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); + goto exit; + } + /* unpack the directives */ + if (0 < cd->ninfo) { + PMIX_INFO_CREATE(cd->info, cd->ninfo); + cnt = cd->ninfo; + if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, cd->info, &cnt, PMIX_INFO))) { + PMIX_ERROR_LOG(rc); + goto exit; + } + } + + /* setup the requesting peer name */ + (void)strncpy(proc.nspace, peer->info->nptr->nspace, PMIX_MAX_NSLEN); + proc.rank = peer->info->rank; + + /* ask the host to execute the request */ + if (PMIX_SUCCESS != (rc = pmix_host_server.monitor(&proc, &monitor, error, + cd->info, cd->ninfo, + cbfunc, cd))) { + goto exit; + } + return PMIX_SUCCESS; + + exit: + PMIX_INFO_DESTRUCT(&monitor); + PMIX_RELEASE(cd); + return rc; +} + /***** INSTANCE SERVER LIBRARY CLASSES *****/ static void tcon(pmix_server_trkr_t *t) { diff --git a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.h b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.h index faad880234b..f502cd33a35 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.h +++ 
b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.h @@ -218,6 +218,16 @@ pmix_status_t pmix_server_alloc(pmix_peer_t *peer, pmix_info_cbfunc_t cbfunc, void *cbdata); +pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer, + pmix_buffer_t *buf, + pmix_info_cbfunc_t cbfunc, + void *cbdata); + +pmix_status_t pmix_server_monitor(pmix_peer_t *peer, + pmix_buffer_t *buf, + pmix_info_cbfunc_t cbfunc, + void *cbdata); + pmix_status_t pmix_server_event_recvd_from_client(pmix_peer_t *peer, pmix_buffer_t *buf, pmix_op_cbfunc_t cbfunc, diff --git a/opal/mca/pmix/pmix2x/pmix/src/util/compress.h b/opal/mca/pmix/pmix2x/pmix/src/util/compress.h index b07b0d2ea71..d81cff74ebb 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/util/compress.h +++ b/opal/mca/pmix/pmix2x/pmix/src/util/compress.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2015-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow diff --git a/opal/mca/pmix/pmix2x/pmix/src/util/error.c b/opal/mca/pmix/pmix2x/pmix/src/util/error.c index 0850e72edb2..d75bc2cd783 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/util/error.c +++ b/opal/mca/pmix/pmix2x/pmix/src/util/error.c @@ -56,6 +56,8 @@ PMIX_EXPORT const char* PMIx_Error_string(pmix_status_t errnum) return "INVALID-KEYVAL"; case PMIX_ERR_INVALID_NUM_PARSED: return "INVALID-NUM-PARSED"; + case PMIX_ERR_TAKE_NEXT_OPTION: + return "TAKE-NEXT-OPTION"; case PMIX_ERR_INVALID_ARGS: return "INVALID-ARGS"; @@ -157,6 +159,14 @@ PMIX_EXPORT const char* PMIx_Error_string(pmix_status_t errnum) return "PMIX_ERR_WILDCARD"; case PMIX_NOTIFY_ALLOC_COMPLETE: return "PMIX ALLOC OPERATION COMPLETE"; + case PMIX_JCTRL_CHECKPOINT: + return "PMIX JOB CONTROL CHECKPOINT"; + case PMIX_JCTRL_PREEMPT_ALERT: + return "PMIX PRE-EMPTION ALERT"; + case PMIX_MONITOR_HEARTBEAT_ALERT: + return "PMIX HEARTBEAT ALERT"; + case PMIX_MONITOR_FILE_ALERT: + return "PMIX FILE MONITOR ALERT"; case PMIX_SUCCESS: return "SUCCESS"; default: diff --git a/opal/mca/pmix/pmix2x/pmix/src/util/error.h b/opal/mca/pmix/pmix2x/pmix/src/util/error.h index b72cecf5180..1883c442e42 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/util/error.h +++ b/opal/mca/pmix/pmix2x/pmix/src/util/error.h @@ -37,6 +37,7 @@ #define PMIX_ERR_NETWORK_NOT_PARSEABLE (PMIX_INTERNAL_ERR_BASE - 33) #define PMIX_ERR_FILE_OPEN_FAILURE (PMIX_INTERNAL_ERR_BASE - 34) #define PMIX_ERR_FILE_READ_FAILURE (PMIX_INTERNAL_ERR_BASE - 35) +#define PMIX_ERR_TAKE_NEXT_OPTION (PMIX_INTERNAL_ERR_BASE - 36) #define PMIX_ERROR_LOG(r) \ do { \ diff --git a/opal/mca/pmix/pmix2x/pmix/test/Makefile.am b/opal/mca/pmix/pmix2x/pmix/test/Makefile.am index 71da45a1e12..1d1a0b8f46f 100644 --- a/opal/mca/pmix/pmix2x/pmix/test/Makefile.am +++ b/opal/mca/pmix/pmix2x/pmix/test/Makefile.am @@ -11,7 +11,7 @@ # All rights reserved. # Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. 
# Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved. -# Copyright (c) 2013-2016 Intel, Inc. All rights reserved +# Copyright (c) 2013-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -32,7 +32,12 @@ headers = test_common.h cli_stages.h server_callbacks.h utils.h test_fence.h \ AM_CPPFLAGS = -I$(top_builddir)/src -I$(top_builddir)/src/include -I$(top_builddir)/src/api noinst_SCRIPTS = pmix_client_otheruser.sh -noinst_PROGRAMS = pmi_client pmi2_client +noinst_PROGRAMS = + +if WANT_PMIX_BACKWARD +noinst_PROGRAMS += pmi_client pmi2_client +endif + if !WANT_HIDDEN noinst_PROGRAMS += pmix_test pmix_client pmix_regex endif @@ -43,6 +48,7 @@ pmix_test_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) pmix_test_LDADD = \ $(top_builddir)/src/libpmix.la +if WANT_PMIX_BACKWARD pmi_client_SOURCES = $(headers) \ pmi_client.c pmi_client_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) @@ -54,6 +60,7 @@ pmi2_client_SOURCES = $(headers) \ pmi2_client_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) pmi2_client_LDADD = \ $(top_builddir)/src/libpmix.la +endif pmix_client_SOURCES = $(headers) \ pmix_client.c test_fence.c test_common.c test_publish.c test_spawn.c \ diff --git a/opal/mca/pmix/pmix2x/pmix2x.c b/opal/mca/pmix/pmix2x/pmix2x.c index 10f6a5e7725..efa8047d266 100644 --- a/opal/mca/pmix/pmix2x/pmix2x.c +++ b/opal/mca/pmix/pmix2x/pmix2x.c @@ -493,6 +493,12 @@ int pmix2x_convert_rc(pmix_status_t rc) case PMIX_QUERY_PARTIAL_SUCCESS: return OPAL_ERR_PARTIAL_SUCCESS; + case PMIX_MONITOR_HEARTBEAT_ALERT: + return OPAL_ERR_HEARTBEAT_ALERT; + + case PMIX_MONITOR_FILE_ALERT: + return OPAL_ERR_FILE_ALERT; + case PMIX_ERROR: return OPAL_ERROR; case PMIX_SUCCESS: @@ -1333,6 +1339,22 @@ static void pmix2x_log(opal_list_t *info, OBJ_RELEASE(cd); } +opal_pmix_alloc_directive_t pmix2x_convert_allocdir(pmix_alloc_directive_t dir) +{ + switch (dir) { + case PMIX_ALLOC_NEW: + return OPAL_PMIX_ALLOC_NEW; + case PMIX_ALLOC_EXTEND: + return OPAL_PMIX_ALLOC_EXTEND; 
+ case PMIX_ALLOC_RELEASE: + return OPAL_PMIX_ALLOC_RELEASE; + case PMIX_ALLOC_REAQUIRE: + return OPAL_PMIX_ALLOC_REAQCUIRE; + default: + return OPAL_PMIX_ALLOC_UNDEF; + } +} + /**** INSTANTIATE INTERNAL CLASSES ****/ OBJ_CLASS_INSTANCE(opal_pmix2x_jobid_trkr_t, opal_list_item_t, @@ -1443,3 +1465,19 @@ static void tsdes(pmix2x_threadshift_t *p) OBJ_CLASS_INSTANCE(pmix2x_threadshift_t, opal_object_t, tscon, tsdes); + +static void dmcon(opal_pmix2x_dmx_trkr_t *p) +{ + p->nspace = NULL; + p->cbfunc = NULL; + p->cbdata = NULL; +} +static void dmdes(opal_pmix2x_dmx_trkr_t *p) +{ + if (NULL != p->nspace) { + free(p->nspace); + } +} +OBJ_CLASS_INSTANCE(opal_pmix2x_dmx_trkr_t, + opal_list_item_t, + dmcon, dmdes); diff --git a/opal/mca/pmix/pmix2x/pmix2x.h b/opal/mca/pmix/pmix2x/pmix2x.h index 541978e4826..63506b19f1f 100644 --- a/opal/mca/pmix/pmix2x/pmix2x.h +++ b/opal/mca/pmix/pmix2x/pmix2x.h @@ -42,6 +42,7 @@ typedef struct { opal_list_t events; int cache_size; opal_list_t cache; + opal_list_t dmdx; } mca_pmix_pmix2x_component_t; OPAL_DECLSPEC extern mca_pmix_pmix2x_component_t mca_pmix_pmix2x_component; @@ -64,6 +65,14 @@ typedef struct { } opal_pmix2x_event_t; OBJ_CLASS_DECLARATION(opal_pmix2x_event_t); +typedef struct { + opal_list_item_t super; + char *nspace; + pmix_modex_cbfunc_t cbfunc; + void *cbdata; +} opal_pmix2x_dmx_trkr_t; +OBJ_CLASS_DECLARATION(opal_pmix2x_dmx_trkr_t); + typedef struct { opal_object_t super; pmix_status_t status; @@ -279,6 +288,8 @@ OPAL_MODULE_DECLSPEC void pmix2x_value_load(pmix_value_t *v, OPAL_MODULE_DECLSPEC int pmix2x_value_unload(opal_value_t *kv, const pmix_value_t *v); +OPAL_MODULE_DECLSPEC opal_pmix_alloc_directive_t pmix2x_convert_allocdir(pmix_alloc_directive_t dir); + END_C_DECLS #endif /* MCA_PMIX_EXTERNAL_H */ diff --git a/opal/mca/pmix/pmix2x/pmix2x_component.c b/opal/mca/pmix/pmix2x/pmix2x_component.c index bd8b74fc163..21785a7edf7 100644 --- a/opal/mca/pmix/pmix2x/pmix2x_component.c +++ 
b/opal/mca/pmix/pmix2x/pmix2x_component.c @@ -80,6 +80,7 @@ static int external_open(void) mca_pmix_pmix2x_component.evindex = 0; OBJ_CONSTRUCT(&mca_pmix_pmix2x_component.jobids, opal_list_t); OBJ_CONSTRUCT(&mca_pmix_pmix2x_component.events, opal_list_t); + OBJ_CONSTRUCT(&mca_pmix_pmix2x_component.dmdx, opal_list_t); return OPAL_SUCCESS; } @@ -88,6 +89,7 @@ static int external_close(void) { OPAL_LIST_DESTRUCT(&mca_pmix_pmix2x_component.jobids); OPAL_LIST_DESTRUCT(&mca_pmix_pmix2x_component.events); + OPAL_LIST_DESTRUCT(&mca_pmix_pmix2x_component.dmdx); return OPAL_SUCCESS; } diff --git a/opal/mca/pmix/pmix2x/pmix2x_server_north.c b/opal/mca/pmix/pmix2x/pmix2x_server_north.c index c08cdf27eab..5094ef3c3bf 100644 --- a/opal/mca/pmix/pmix2x/pmix2x_server_north.c +++ b/opal/mca/pmix/pmix2x/pmix2x_server_north.c @@ -45,63 +45,73 @@ /* These are the interfaces used by the embedded PMIx server * to call up into ORTE for service requests */ - static pmix_status_t server_client_connected_fn(const pmix_proc_t *proc, void* server_object, - pmix_op_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_client_finalized_fn(const pmix_proc_t *proc, void* server_object, - pmix_op_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_abort_fn(const pmix_proc_t *proc, void *server_object, - int status, const char msg[], - pmix_proc_t procs[], size_t nprocs, - pmix_op_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_fencenb_fn(const pmix_proc_t procs[], size_t nprocs, - const pmix_info_t info[], size_t ninfo, - char *data, size_t ndata, - pmix_modex_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_dmodex_req_fn(const pmix_proc_t *proc, - const pmix_info_t info[], size_t ninfo, - pmix_modex_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_publish_fn(const pmix_proc_t *proc, - const pmix_info_t info[], size_t ninfo, - pmix_op_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_lookup_fn(const pmix_proc_t *proc, char 
**keys, +static pmix_status_t server_client_connected_fn(const pmix_proc_t *proc, void* server_object, + pmix_op_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_client_finalized_fn(const pmix_proc_t *proc, void* server_object, + pmix_op_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_abort_fn(const pmix_proc_t *proc, void *server_object, + int status, const char msg[], + pmix_proc_t procs[], size_t nprocs, + pmix_op_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_fencenb_fn(const pmix_proc_t procs[], size_t nprocs, const pmix_info_t info[], size_t ninfo, - pmix_lookup_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_unpublish_fn(const pmix_proc_t *proc, char **keys, + char *data, size_t ndata, + pmix_modex_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_dmodex_req_fn(const pmix_proc_t *proc, + const pmix_info_t info[], size_t ninfo, + pmix_modex_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_publish_fn(const pmix_proc_t *proc, + const pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_lookup_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, + pmix_lookup_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_unpublish_fn(const pmix_proc_t *proc, char **keys, + const pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_spawn_fn(const pmix_proc_t *proc, + const pmix_info_t job_info[], size_t ninfo, + const pmix_app_t apps[], size_t napps, + pmix_spawn_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_connect_fn(const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_disconnect_fn(const pmix_proc_t procs[], size_t nprocs, const pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t 
server_spawn_fn(const pmix_proc_t *proc, - const pmix_info_t job_info[], size_t ninfo, - const pmix_app_t apps[], size_t napps, - pmix_spawn_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_connect_fn(const pmix_proc_t procs[], size_t nprocs, - const pmix_info_t info[], size_t ninfo, - pmix_op_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_disconnect_fn(const pmix_proc_t procs[], size_t nprocs, - const pmix_info_t info[], size_t ninfo, - pmix_op_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_register_events(pmix_status_t *codes, size_t ncodes, - const pmix_info_t info[], size_t ninfo, - pmix_op_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_deregister_events(pmix_status_t *codes, size_t ncodes, - pmix_op_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_notify_event(pmix_status_t code, - const pmix_proc_t *source, - pmix_data_range_t range, - pmix_info_t info[], size_t ninfo, - pmix_op_cbfunc_t cbfunc, void *cbdata); - static pmix_status_t server_query(pmix_proc_t *proct, - pmix_query_t *queryies, size_t nqueries, - pmix_info_cbfunc_t cbfunc, +static pmix_status_t server_register_events(pmix_status_t *codes, size_t ncodes, + const pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_deregister_events(pmix_status_t *codes, size_t ncodes, + pmix_op_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_notify_event(pmix_status_t code, + const pmix_proc_t *source, + pmix_data_range_t range, + pmix_info_t info[], size_t ninfo, + pmix_op_cbfunc_t cbfunc, void *cbdata); +static pmix_status_t server_query(pmix_proc_t *proct, + pmix_query_t *queryies, size_t nqueries, + pmix_info_cbfunc_t cbfunc, + void *cbdata); +static void server_tool_connection(pmix_info_t *info, size_t ninfo, + pmix_tool_connection_cbfunc_t cbfunc, void *cbdata); - static void server_tool_connection(pmix_info_t *info, size_t ninfo, - pmix_tool_connection_cbfunc_t cbfunc, - 
void *cbdata); static void server_log(const pmix_proc_t *client, const pmix_info_t data[], size_t ndata, const pmix_info_t directives[], size_t ndirs, pmix_op_cbfunc_t cbfunc, void *cbdata); - pmix_server_module_t mymodule = { +static pmix_status_t server_allocate(const pmix_proc_t *client, + pmix_alloc_directive_t directive, + const pmix_info_t data[], size_t ndata, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +static pmix_status_t server_job_control(const pmix_proc_t *requestor, + const pmix_proc_t targets[], size_t ntargets, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata); + +pmix_server_module_t mymodule = { .client_connected = server_client_connected_fn, .client_finalized = server_client_finalized_fn, .abort = server_abort_fn, @@ -118,7 +128,11 @@ static void server_log(const pmix_proc_t *client, .notify_event = server_notify_event, .query = server_query, .tool_connected = server_tool_connection, - .log = server_log + .log = server_log, + .allocate = server_allocate, + .job_control = server_job_control + /* we do not support monitoring, but use the + * PMIx internal monitoring capability */ }; opal_pmix_server_module_t *host_module = NULL; @@ -252,6 +266,7 @@ static void opmdx_response(int status, const char *data, size_t sz, void *cbdata { pmix_status_t rc; pmix2x_opalcaddy_t *opalcaddy = (pmix2x_opalcaddy_t*)cbdata; + opal_pmix2x_dmx_trkr_t *dmdx; rc = pmix2x_convert_rc(status); if (NULL != opalcaddy->mdxcbfunc) { @@ -259,6 +274,13 @@ static void opmdx_response(int status, const char *data, size_t sz, void *cbdata opalcaddy->ocbdata = relcbdata; opalcaddy->mdxcbfunc(rc, data, sz, opalcaddy->cbdata, _data_release, opalcaddy); + /* if we were collecting all data, then check for any pending + * dmodex requests that we cached and notify them that the + * data has arrived */ + while (NULL != (dmdx = (opal_pmix2x_dmx_trkr_t*)opal_list_remove_first(&mca_pmix_pmix2x_component.dmdx))) { + dmdx->cbfunc(PMIX_SUCCESS, NULL, 0,
dmdx->cbdata, NULL, NULL); + OBJ_RELEASE(dmdx); + } } else { OBJ_RELEASE(opalcaddy); } @@ -278,7 +300,6 @@ static pmix_status_t server_fencenb_fn(const pmix_proc_t procs[], size_t nprocs, if (NULL == host_module || NULL == host_module->fence_nb) { return PMIX_ERR_NOT_SUPPORTED; } - /* setup the caddy */ opalcaddy = OBJ_NEW(pmix2x_opalcaddy_t); opalcaddy->mdxcbfunc = cbfunc; @@ -324,6 +345,7 @@ static pmix_status_t server_dmodex_req_fn(const pmix_proc_t *p, opal_process_name_t proc; opal_value_t *iptr; size_t n; + opal_pmix2x_dmx_trkr_t *dmdx; if (NULL == host_module || NULL == host_module->direct_modex) { return PMIX_ERR_NOT_SUPPORTED; @@ -340,6 +362,21 @@ static pmix_status_t server_dmodex_req_fn(const pmix_proc_t *p, opalcaddy->mdxcbfunc = cbfunc; opalcaddy->cbdata = cbdata; + /* this function should only get called if we are in an async modex. + * If we are also collecting data, then the fence_nb will eventually + * complete and return all the required data down to the pmix + * server beneath us. 
Thus, we only need to track the dmodex_req + * and ensure that the release gets called once the data has + * arrived - this will trigger the pmix server to tell the + * client that the data is available */ + if (opal_pmix_base_async_modex && opal_pmix_collect_all_data) { + dmdx = OBJ_NEW(opal_pmix2x_dmx_trkr_t); + dmdx->cbfunc = cbfunc; + dmdx->cbdata = cbdata; + opal_list_append(&mca_pmix_pmix2x_component.dmdx, &dmdx->super); + return PMIX_SUCCESS; + } + /* convert the array of pmix_info_t to the list of info */ for (n=0; n < ninfo; n++) { iptr = OBJ_NEW(opal_value_t); @@ -1052,3 +1089,117 @@ static void server_log(const pmix_proc_t *proct, &opalcaddy->apps, opal_opcbfunc, opalcaddy); } + +static pmix_status_t server_allocate(const pmix_proc_t *proct, + pmix_alloc_directive_t directive, + const pmix_info_t data[], size_t ndata, + pmix_info_cbfunc_t cbfunc, void *cbdata) +{ + pmix2x_opalcaddy_t *opalcaddy; + opal_process_name_t requestor; + int rc; + size_t n; + opal_value_t *oinfo; + opal_pmix_alloc_directive_t odir; + + if (NULL == host_module || NULL == host_module->allocate) { + return PMIX_ERR_NOT_SUPPORTED; + } + + /* setup the caddy */ + opalcaddy = OBJ_NEW(pmix2x_opalcaddy_t); + opalcaddy->infocbfunc = cbfunc; + opalcaddy->cbdata = cbdata; + + /* convert the requestor */ + if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&requestor.jobid, proct->nspace))) { + OBJ_RELEASE(opalcaddy); + return pmix2x_convert_opalrc(rc); + } + requestor.vpid = pmix2x_convert_rank(proct->rank); + + /* convert the directive */ + odir = pmix2x_convert_allocdir(directive); + + /* convert the data */ + for (n=0; n < ndata; n++) { + oinfo = OBJ_NEW(opal_value_t); + opal_list_append(&opalcaddy->info, &oinfo->super); + if (OPAL_SUCCESS != (rc = pmix2x_value_unload(oinfo, &data[n].value))) { + OBJ_RELEASE(opalcaddy); + return pmix2x_convert_opalrc(rc); + } + } + + /* pass the call upwards */ + if (OPAL_SUCCESS != (rc = host_module->allocate(&requestor, odir, + &opalcaddy->info, + 
info_cbfunc, opalcaddy))) { + OBJ_RELEASE(opalcaddy); + return pmix2x_convert_opalrc(rc); + } + + return PMIX_SUCCESS; + +} + +static pmix_status_t server_job_control(const pmix_proc_t *proct, + const pmix_proc_t targets[], size_t ntargets, + const pmix_info_t directives[], size_t ndirs, + pmix_info_cbfunc_t cbfunc, void *cbdata) +{ + pmix2x_opalcaddy_t *opalcaddy; + opal_process_name_t requestor; + int rc; + size_t n; + opal_value_t *oinfo; + opal_namelist_t *nm; + + if (NULL == host_module || NULL == host_module->job_control) { + return PMIX_ERR_NOT_SUPPORTED; + } + + /* setup the caddy */ + opalcaddy = OBJ_NEW(pmix2x_opalcaddy_t); + opalcaddy->infocbfunc = cbfunc; + opalcaddy->cbdata = cbdata; + + /* convert the requestor */ + if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&requestor.jobid, proct->nspace))) { + OBJ_RELEASE(opalcaddy); + return pmix2x_convert_opalrc(rc); + } + requestor.vpid = pmix2x_convert_rank(proct->rank); + + /* convert the targets */ + for (n=0; n < ntargets; n++) { + nm = OBJ_NEW(opal_namelist_t); + opal_list_append(&opalcaddy->procs, &nm->super); + if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&nm->name.jobid, targets[n].nspace))) { + OBJ_RELEASE(opalcaddy); + return pmix2x_convert_opalrc(rc); + } + nm->name.vpid = pmix2x_convert_rank(targets[n].rank); + } + + /* convert the directives */ + for (n=0; n < ndirs; n++) { + oinfo = OBJ_NEW(opal_value_t); + opal_list_append(&opalcaddy->info, &oinfo->super); + if (OPAL_SUCCESS != (rc = pmix2x_value_unload(oinfo, &directives[n].value))) { + OBJ_RELEASE(opalcaddy); + return pmix2x_convert_opalrc(rc); + } + } + + /* pass the call upwards */ + if (OPAL_SUCCESS != (rc = host_module->job_control(&requestor, + &opalcaddy->procs, + &opalcaddy->info, + info_cbfunc, opalcaddy))) { + OBJ_RELEASE(opalcaddy); + return pmix2x_convert_opalrc(rc); + } + + return PMIX_SUCCESS; +} diff --git a/opal/mca/pmix/pmix_server.h b/opal/mca/pmix/pmix_server.h index d83ed39e88c..8bfaf467bb4 100644 --- 
a/opal/mca/pmix/pmix_server.h +++ b/opal/mca/pmix/pmix_server.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -230,6 +230,19 @@ typedef void (*opal_pmix_connection_cbfunc_t)(int incoming_sd); typedef int (*opal_pmix_server_listener_fn_t)(int listening_sd, opal_pmix_connection_cbfunc_t cbfunc); +/* Request allocation modifications on behalf of a client */ +typedef int (*opal_pmix_server_alloc_fn_t)(const opal_process_name_t *client, + opal_pmix_alloc_directive_t directive, + opal_list_t *data, + opal_pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* Execute a job control action on behalf of a client */ +typedef int (*opal_pmix_server_job_control_fn_t)(const opal_process_name_t *requestor, + opal_list_t *targets, opal_list_t *directives, + opal_pmix_info_cbfunc_t cbfunc, void *cbdata); + +/* we do not provide a monitoring capability */ + typedef struct opal_pmix_server_module_1_0_0_t { opal_pmix_server_client_connected_fn_t client_connected; opal_pmix_server_client_finalized_fn_t client_finalized; @@ -249,6 +262,8 @@ typedef struct opal_pmix_server_module_1_0_0_t { opal_pmix_server_tool_connection_fn_t tool_connected; opal_pmix_server_log_fn_t log; opal_pmix_server_listener_fn_t listener; + opal_pmix_server_alloc_fn_t allocate; + opal_pmix_server_job_control_fn_t job_control; } opal_pmix_server_module_t; diff --git a/opal/mca/pmix/pmix_types.h b/opal/mca/pmix/pmix_types.h index 392c3401e49..113ea02c330 100644 --- a/opal/mca/pmix/pmix_types.h +++ b/opal/mca/pmix/pmix_types.h @@ -32,6 +32,11 @@ BEGIN_C_DECLS * that key */ #define OPAL_PMIX_RANK_WILDCARD UINT32_MAX-1 +/* other special rank values will be used to define + * groups of ranks for use in collectives */ +#define OPAL_PMIX_RANK_LOCAL_NODE UINT32_MAX-2 // all ranks on local node + + /* define a set of "standard" attributes that can * be queried. 
Implementations (and users) are free to extend as * desired, so the get functions need to be capable @@ -55,12 +60,15 @@ BEGIN_C_DECLS #define OPAL_PMIX_CONNECT_TO_SYSTEM "pmix.cnct.sys" // (bool) The requestor requires that a connection be made only to // a local system-level PMIx server #define OPAL_PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first +#define OPAL_PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data +#define OPAL_PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server /* identification attributes */ #define OPAL_PMIX_USERID "pmix.euid" // (uint32_t) effective user id #define OPAL_PMIX_GRPID "pmix.egid" // (uint32_t) effective group id + /* attributes for the rendezvous socket */ #define OPAL_PMIX_USOCK_DISABLE "pmix.usock.disable" // (bool) disable legacy usock support #define OPAL_PMIX_SOCKET_MODE "pmix.sockmode" // (uint32_t) POSIX mode_t (9 bits valid) @@ -76,6 +84,7 @@ BEGIN_C_DECLS #define OPAL_PMIX_TCP_DISABLE_IPV4 "pmix.tcp.disipv4" // (bool) true to disable IPv4 family #define OPAL_PMIX_TCP_DISABLE_IPV6 "pmix.tcp.disipv6" // (bool) true to disable IPv6 family + /* general proc-level attributes */ #define OPAL_PMIX_CPUSET "pmix.cpuset" // (char*) hwloc bitmap applied to proc upon launch #define OPAL_PMIX_CREDENTIAL "pmix.cred" // (char*) security credential assigned to proc @@ -89,6 +98,7 @@ BEGIN_C_DECLS #define OPAL_PMIX_PROCDIR "pmix.pdir" // (char*) sub-nsdir assigned to proc #define OPAL_PMIX_TDIR_RMCLEAN "pmix.tdir.rmclean" // (bool) Resource Manager will clean session directories + /* information about relative ranks as assigned by the RM */ #define OPAL_PMIX_PROCID "pmix.procid" // (opal_process_name_t) process identifier #define OPAL_PMIX_NSPACE "pmix.nspace" // (char*) nspace of a job @@ -104,25 +114,26 @@ BEGIN_C_DECLS #define OPAL_PMIX_LOCALLDR "pmix.lldr" // (uint64_t) 
opal_identifier of lowest rank on this node within this job #define OPAL_PMIX_APPLDR "pmix.aldr" // (uint32_t) lowest rank in this app within this job #define OPAL_PMIX_PROC_PID "pmix.ppid" // (pid_t) pid of specified proc - -/**** no PMIx equivalent ****/ -#define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs -#define OPAL_PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) topology signature string -#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location -#define OPAL_PMIX_AVAIL_PHYS_MEMORY "pmix.pmem" // (uint64_t) total available physical memory on this node - +#define OPAL_PMIX_SESSION_ID "pmix.session.id" // (uint32_t) session identifier #define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for the specified nspace #define OPAL_PMIX_ALLOCATED_NODELIST "pmix.alist" // (char*) comma-delimited list of all nodes in this allocation regardless of // whether or not they currently host procs. 
#define OPAL_PMIX_HOSTNAME "pmix.hname" // (char*) name of the host the specified proc is on #define OPAL_PMIX_NODEID "pmix.nodeid" // (uint32_t) node identifier #define OPAL_PMIX_LOCAL_PEERS "pmix.lpeers" // (char*) comma-delimited string of ranks on this node within the specified nspace +#define OPAL_PMIX_LOCAL_PROCS "pmix.lprocs" // (opal_list_t*) list of opal_namelist_t of procs on the specified node #define OPAL_PMIX_LOCAL_CPUSETS "pmix.lcpus" // (char*) colon-delimited cpusets of local peers within the specified nspace #define OPAL_PMIX_PROC_URI "opal.puri" // (char*) URI containing contact info for proc - NOTE: this is published by procs and // thus cannot be prefixed with "pmix" +#define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs + + +/* Memory info */ +#define OPAL_PMIX_AVAIL_PHYS_MEMORY "pmix.pmem" // (uint64_t) total available physical memory on this node #define OPAL_PMIX_DAEMON_MEMORY "pmix.dmn.mem" // (float) Mbytes of memory currently used by daemon #define OPAL_PMIX_CLIENT_AVG_MEMORY "pmix.cl.mem.avg" // (float) Average Mbytes of memory used by client processes + /* size info */ #define OPAL_PMIX_UNIV_SIZE "pmix.univ.size" // (uint32_t) #procs in this nspace #define OPAL_PMIX_JOB_SIZE "pmix.job.size" // (uint32_t) #procs in this job @@ -133,11 +144,15 @@ BEGIN_C_DECLS #define OPAL_PMIX_MAX_PROCS "pmix.max.size" // (uint32_t) max #procs for this job #define OPAL_PMIX_NUM_NODES "pmix.num.nodes" // (uint32_t) #nodes in this nspace + /* topology info */ #define OPAL_PMIX_NET_TOPO "pmix.ntopo" // (char*) xml-representation of network topology #define OPAL_PMIX_LOCAL_TOPO "pmix.ltopo" // (char*) xml-representation of local node topology #define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for this job #define OPAL_PMIX_TOPOLOGY "pmix.topo" // (hwloc_topology_t) pointer to the PMIx client's internal topology object +#define OPAL_PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) 
topology signature string +#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location + /* request-related info */ #define OPAL_PMIX_COLLECT_DATA "pmix.collect" // (bool) collect data and return it at the end of the operation @@ -156,16 +171,19 @@ BEGIN_C_DECLS #define OPAL_PMIX_EMBED_BARRIER "pmix.embed.barrier" // (bool) execute a blocking fence operation before executing the // specified operation + /* attribute used by host server to pass data to the server convenience library - the * data will then be parsed and provided to the local clients */ #define OPAL_PMIX_PROC_DATA "pmix.pdata" // (pmix_value_array_t) starts with rank, then contains more data #define OPAL_PMIX_NODE_MAP "pmix.nmap" // (char*) regex of nodes containing procs for this job #define OPAL_PMIX_PROC_MAP "pmix.pmap" // (char*) regex describing procs on each node within this job + /* attributes used internally to communicate data from the server to the client */ #define OPAL_PMIX_PROC_BLOB "pmix.pblob" // (pmix_byte_object_t) packed blob of process data #define OPAL_PMIX_MAP_BLOB "pmix.mblob" // (pmix_byte_object_t) packed blob of process location + /* error handler registration and notification info keys */ #define OPAL_PMIX_EVENT_HDLR_NAME "pmix.evname" // (char*) string name identifying this handler #define OPAL_PMIX_EVENT_JOB_LEVEL "pmix.evjob" // (bool) register for job-specific events only @@ -187,7 +205,7 @@ BEGIN_C_DECLS #define OPAL_PMIX_EVENT_ACTION_TIMEOUT "pmix.evtimeout" // (int) time in sec before RM will execute error response -/* attributes used to describe "spawm" attributes */ +/* attributes used to describe "spawn" attributes */ #define OPAL_PMIX_PERSONALITY "pmix.pers" // (char*) name of personality to use #define OPAL_PMIX_HOST "pmix.host" // (char*) comma-delimited list of hosts to use for spawned procs #define OPAL_PMIX_HOSTFILE "pmix.hostfile" // (char*) hostfile to use for spawned procs @@ -229,19 +247,89 @@ BEGIN_C_DECLS #define 
OPAL_PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // constrain the query to local information only #define OPAL_PMIX_QUERY_REPORT_AVG "pmix.qry.avg" // report average values #define OPAL_PMIX_QUERY_REPORT_MINMAX "pmix.qry.minmax" // report minimum and maximum value +#define OPAL_PMIX_QUERY_ALLOC_STATUS "pmix.query.alloc" // (char*) string identifier of the allocation whose status + // is being requested +#define OPAL_PMIX_TIME_REMAINING "pmix.time.remaining" // (char*) query number of seconds (uint32_t) remaining in allocation + // for the specified nspace /* log attributes */ -#define OPAL_PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr -#define OPAL_PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout -#define OPAL_PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless -#define OPAL_PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere +#define OPAL_PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr +#define OPAL_PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout +#define OPAL_PMIX_LOG_SYSLOG "pmix.log.syslog" // (char*) log data to syslog - defaults to ERROR priority unless +#define OPAL_PMIX_LOG_MSG "pmix.log.msg" // (pmix_byte_object_t) message blob to be sent somewhere +#define OPAL_PMIX_LOG_EMAIL "pmix.log.email" // (pmix_data_array_t) log via email based on pmix_info_t containing directives +#define OPAL_PMIX_LOG_EMAIL_ADDR "pmix.log.emaddr" // (char*) comma-delimited list of email addresses that are to recv msg +#define OPAL_PMIX_LOG_EMAIL_SUBJECT "pmix.log.emsub" // (char*) subject line for email +#define OPAL_PMIX_LOG_EMAIL_MSG "pmix.log.emmsg" // (char*) msg to be included in email + /* debugger attributes */ -#define OPAL_PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start -#define OPAL_PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop 
during PMIx init -#define OPAL_PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification -#define OPAL_PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are -#define OPAL_PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release +#define OPAL_PMIX_DEBUG_STOP_ON_EXEC "pmix.dbg.exec" // (bool) job is being spawned under debugger - instruct it to pause on start +#define OPAL_PMIX_DEBUG_STOP_IN_INIT "pmix.dbg.init" // (bool) instruct job to stop during PMIx init +#define OPAL_PMIX_DEBUG_WAIT_FOR_NOTIFY "pmix.dbg.notify" // (bool) block at desired point until receiving debugger release notification +#define OPAL_PMIX_DEBUG_JOB "pmix.dbg.job" // (char*) nspace of the job to be debugged - the RM/PMIx server are +#define OPAL_PMIX_DEBUG_WAITING_FOR_NOTIFY "pmix.dbg.waiting" // (bool) job to be debugged is waiting for a release + + +/* Resource Manager identification */ +#define OPAL_PMIX_RM_NAME "pmix.rm.name" // (char*) string name of the resource manager +#define OPAL_PMIX_RM_VERSION "pmix.rm.version" // (char*) RM version string + + +/* attributes for setting envars */ +#define OPAL_PMIX_SET_ENVAR "pmix.set.envar" // (char*) string "key=value" value shall be put into the environment +#define OPAL_PMIX_UNSET_ENVAR "pmix.unset.envar" // (char*) unset envar specified in string + + +/* attributes relating to allocations */ +#define OPAL_PMIX_ALLOC_ID "pmix.alloc.id" // (char*) provide a string identifier for this allocation request + // which can later be used to query status of the request +#define OPAL_PMIX_ALLOC_NUM_NODES "pmix.alloc.nnodes" // (uint64_t) number of nodes +#define OPAL_PMIX_ALLOC_NODE_LIST "pmix.alloc.nlist" // (char*) regex of specific nodes +#define OPAL_PMIX_ALLOC_NUM_CPUS "pmix.alloc.ncpus" // (uint64_t) number of cpus +#define OPAL_PMIX_ALLOC_NUM_CPU_LIST "pmix.alloc.ncpulist" // (char*) regex of 
#cpus for each node +#define OPAL_PMIX_ALLOC_CPU_LIST "pmix.alloc.cpulist" // (char*) regex of specific cpus indicating the cpus involved. +#define OPAL_PMIX_ALLOC_MEM_SIZE "pmix.alloc.msize" // (float) number of Mbytes +#define OPAL_PMIX_ALLOC_NETWORK "pmix.alloc.net" // (array) array of pmix_info_t describing network resources. If not + // given as part of an info struct that identifies the + // impacted nodes, then the description will be applied + // across all nodes in the requestor's allocation +#define OPAL_PMIX_ALLOC_NETWORK_ID "pmix.alloc.netid" // (char*) name of network +#define OPAL_PMIX_ALLOC_BANDWIDTH "pmix.alloc.bw" // (float) Mbits/sec +#define OPAL_PMIX_ALLOC_NETWORK_QOS "pmix.alloc.netqos" // (char*) quality of service level +#define OPAL_PMIX_ALLOC_TIME "pmix.alloc.time" // (uint32_t) time in seconds + + +/* job control attributes */ +#define OPAL_PMIX_JOB_CTRL_ID "pmix.jctrl.id" // (char*) provide a string identifier for this request +#define OPAL_PMIX_JOB_CTRL_PAUSE "pmix.jctrl.pause" // (bool) pause the specified processes +#define OPAL_PMIX_JOB_CTRL_RESUME "pmix.jctrl.resume" // (bool) "un-pause" the specified processes +#define OPAL_PMIX_JOB_CTRL_CANCEL "pmix.jctrl.cancel" // (char*) cancel the specified request + // (NULL => cancel all requests from this requestor) +#define OPAL_PMIX_JOB_CTRL_KILL "pmix.jctrl.kill" // (bool) forcibly terminate the specified processes and cleanup +#define OPAL_PMIX_JOB_CTRL_RESTART "pmix.jctrl.restart" // (char*) restart the specified processes using the given checkpoint ID +#define OPAL_PMIX_JOB_CTRL_CHECKPOINT "pmix.jctrl.ckpt" // (char*) checkpoint the specified processes and assign the given ID to it +#define OPAL_PMIX_JOB_CTRL_CHECKPOINT_EVENT "pmix.jctrl.ckptev" // (bool) use event notification to trigger process checkpoint +#define OPAL_PMIX_JOB_CTRL_CHECKPOINT_SIGNAL "pmix.jctrl.ckptsig" // (int) use the given signal to trigger process checkpoint +#define OPAL_PMIX_JOB_CTRL_CHECKPOINT_TIMEOUT 
"pmix.jctrl.ckptsig" // (int) time in seconds to wait for checkpoint to complete +#define OPAL_PMIX_JOB_CTRL_SIGNAL "pmix.jctrl.sig" // (int) send given signal to specified processes +#define OPAL_PMIX_JOB_CTRL_PROVISION "pmix.jctrl.pvn" // (char*) regex identifying nodes that are to be provisioned +#define OPAL_PMIX_JOB_CTRL_PROVISION_IMAGE "pmix.jctrl.pvnimg" // (char*) name of the image that is to be provisioned +#define OPAL_PMIX_JOB_CTRL_PREEMPTIBLE "pmix.jctrl.preempt" // (bool) job can be pre-empted + +/* monitoring attributes */ +#define OPAL_PMIX_MONITOR_HEARTBEAT "pmix.monitor.mbeat" // (void) register to have the server monitor the requestor for heartbeats +#define OPAL_PMIX_SEND_HEARTBEAT "pmix.monitor.beat" // (void) send heartbeat to local server +#define OPAL_PMIX_MONITOR_HEARTBEAT_TIME "pmix.monitor.btime" // (uint32_t) time in seconds before declaring heartbeat missed +#define OPAL_PMIX_MONITOR_HEARTBEAT_DROPS "pmix.monitor.bdrop" // (uint32_t) number of heartbeats that can be missed before taking + // specified action +#define OPAL_PMIX_MONITOR_FILE "pmix.monitor.fmon" // (char*) register to monitor file for signs of life +#define OPAL_PMIX_MONITOR_FILE_SIZE "pmix.monitor.fsize" // (bool) monitor size of given file is growing to determine app is running +#define OPAL_PMIX_MONITOR_FILE_ACCESS "pmix.monitor.faccess" // (char*) monitor time since last access of given file to determine app is running +#define OPAL_PMIX_MONITOR_FILE_MODIFY "pmix.monitor.fmod" // (char*) monitor time since last modified of given file to determine app is running +#define OPAL_PMIX_MONITOR_FILE_CHECK_TIME "pmix.monitor.ftime" // (uint32_t) time in seconds between checking file +#define OPAL_PMIX_MONITOR_FILE_DROPS "pmix.monitor.fdrop" // (uint32_t) number of file checks that can be missed before taking + // specified action /* define a scope for data "put" by PMI per the following: @@ -285,6 +373,16 @@ typedef enum { } opal_pmix_persistence_t; +/* define allocation 
request flags */ +typedef enum { + OPAL_PMIX_ALLOC_UNDEF = 0, + OPAL_PMIX_ALLOC_NEW, + OPAL_PMIX_ALLOC_EXTEND, + OPAL_PMIX_ALLOC_RELEASE, + OPAL_PMIX_ALLOC_REAQCUIRE +} opal_pmix_alloc_directive_t; + + /**** PMIX INFO STRUCT ****/ /* NOTE: the pmix_info_t is essentially equivalent to the opal_value_t diff --git a/opal/util/cmd_line.c b/opal/util/cmd_line.c index f3383490e62..3261e9a3b90 100644 --- a/opal/util/cmd_line.c +++ b/opal/util/cmd_line.c @@ -10,12 +10,12 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2016 Intel, Inc. All rights reserved + * Copyright (c) 2016-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -70,6 +70,7 @@ struct cmd_line_option_t { char *clo_mca_param_env_var; void *clo_variable_dest; bool clo_variable_set; + opal_cmd_line_otype_t clo_otype; }; typedef struct cmd_line_option_t cmd_line_option_t; static void option_constructor(cmd_line_option_t *cmd); @@ -141,6 +142,8 @@ static cmd_line_option_t *find_option(opal_cmd_line_t *cmd, static int set_dest(cmd_line_option_t *option, char *sval); static void fill(const cmd_line_option_t *a, char result[3][BUFSIZ]); static int qsort_callback(const void *a, const void *b); +static opal_cmd_line_otype_t get_help_otype(opal_cmd_line_t *cmd); +static char *build_parsable(cmd_line_option_t *option); /* @@ -255,6 +258,7 @@ int opal_cmd_line_parse(opal_cmd_line_t *cmd, bool ignore_unknown, bool ignore_u int num_args_used; bool have_help_option = false; bool printed_error = false; + bool help_without_arg = false; /* Bozo check */ @@ -394,10 +398,17 @@ int opal_cmd_line_parse(opal_cmd_line_t *cmd, bool ignore_unknown, bool ignore_u recognized */ for (j = 0; j < option->clo_num_params; ++j, ++i) { - - /* If we run out of parameters, error */ - + /* If we run out of parameters, error, unless its a help request + which can have 0 or 1 arguments */ if (i >= cmd->lcl_argc) { + /* If this is a help request, can have no arguments */ + if((NULL != option->clo_single_dash_name && + 0 == strcmp(option->clo_single_dash_name, "h")) || + (NULL != option->clo_long_name && + 0 == strcmp(option->clo_long_name, "help"))) { + help_without_arg = true; + continue; + } fprintf(stderr, "%s: Error: option \"%s\" did not " "have enough parameters (%d)\n", cmd->lcl_argv[0], @@ -454,10 +465,11 @@ int opal_cmd_line_parse(opal_cmd_line_t *cmd, bool ignore_unknown, bool ignore_u } } - /* If there are no options to this command, see if we - need to set a boolean value to "true". 
*/ + /* If there are no options to this command or it is + a help request with no argument, see if we need to + set a boolean value to "true". */ - if (0 == option->clo_num_params) { + if (0 == option->clo_num_params || help_without_arg) { if (OPAL_SUCCESS != (ret = set_dest(option, "1"))) { opal_mutex_unlock(&cmd->lcl_mutex); return ret; @@ -524,6 +536,7 @@ char *opal_cmd_line_get_usage_msg(opal_cmd_line_t *cmd) char *start, *desc, *ptr; opal_list_item_t *item; cmd_line_option_t *option, **sorted; + opal_cmd_line_otype_t otype; /* Thread serialization */ @@ -550,135 +563,125 @@ char *opal_cmd_line_get_usage_msg(opal_cmd_line_t *cmd) } qsort(sorted, i, sizeof(cmd_line_option_t*), qsort_callback); + /* Find if a help argument was passed, and return its type if it was. */ + + otype = get_help_otype(cmd); + /* Now go through the sorted array and make the strings */ for (j = 0; j < opal_list_get_size(&cmd->lcl_options); ++j) { option = sorted[j]; - if (NULL != option->clo_description) { - bool filled = false; - - /* Build up the output line */ - - memset(line, 0, sizeof(line)); - if ('\0' != option->clo_short_name) { - line[0] = '-'; - line[1] = option->clo_short_name; - filled = true; - } else { - line[0] = ' '; - line[1] = ' '; - } - if (NULL != option->clo_single_dash_name) { - line[2] = (filled) ? 
'|' : ' '; - strncat(line, "-", sizeof(line) - 1); - strncat(line, option->clo_single_dash_name, sizeof(line) - 1); - filled = true; - } - if (NULL != option->clo_long_name) { - if (filled) { - strncat(line, "|", sizeof(line) - 1); + if(otype == OPAL_CMD_LINE_OTYPE_PARSABLE) { + ret = build_parsable(option); + opal_argv_append(&argc, &argv, ret); + free(ret); + ret = NULL; + } else if(otype == OPAL_CMD_LINE_OTYPE_NULL || option->clo_otype == otype) { + if (NULL != option->clo_description) { + bool filled = false; + + /* Build up the output line */ + + memset(line, 0, sizeof(line)); + if ('\0' != option->clo_short_name) { + line[0] = '-'; + line[1] = option->clo_short_name; + filled = true; } else { - strncat(line, " ", sizeof(line) - 1); + line[0] = ' '; + line[1] = ' '; + } + if (NULL != option->clo_single_dash_name) { + line[2] = (filled) ? '|' : ' '; + strncat(line, "-", sizeof(line) - 1); + strncat(line, option->clo_single_dash_name, sizeof(line) - 1); + filled = true; + } + if (NULL != option->clo_long_name) { + if (filled) { + strncat(line, "|", sizeof(line) - 1); + } else { + strncat(line, " ", sizeof(line) - 1); + } + strncat(line, "--", sizeof(line) - 1); + strncat(line, option->clo_long_name, sizeof(line) - 1); } - strncat(line, "--", sizeof(line) - 1); - strncat(line, option->clo_long_name, sizeof(line) - 1); - } - strncat(line, " ", sizeof(line) - 1); - for (i = 0; (int)i < option->clo_num_params; ++i) { - len = sizeof(temp); - snprintf(temp, len, "<arg%d> ", (int)i); - strncat(line, temp, sizeof(line) - 1); - } - if (option->clo_num_params > 0) { strncat(line, " ", sizeof(line) - 1); - } + for (i = 0; (int)i < option->clo_num_params; ++i) { + len = sizeof(temp); + snprintf(temp, len, "<arg%d> ", (int)i); + strncat(line, temp, sizeof(line) - 1); + } + if (option->clo_num_params > 0) { + strncat(line, " ", sizeof(line) - 1); + } - /* If we're less than param width, then start adding the - description to this line.
Otherwise, finish this line - and start adding the description on the next line. */ + /* If we're less than param width, then start adding the + description to this line. Otherwise, finish this line + and start adding the description on the next line. */ - if (strlen(line) > PARAM_WIDTH) { - opal_argv_append(&argc, &argv, line); + if (strlen(line) > PARAM_WIDTH) { + opal_argv_append(&argc, &argv, line); - /* Now reset the line to be all blanks up to - PARAM_WIDTH so that we can start adding the - description */ + /* Now reset the line to be all blanks up to + PARAM_WIDTH so that we can start adding the + description */ - memset(line, ' ', PARAM_WIDTH); - line[PARAM_WIDTH] = '\0'; - } else { + memset(line, ' ', PARAM_WIDTH); + line[PARAM_WIDTH] = '\0'; + } else { - /* Add enough blanks to the end of the line so that we - can start adding the description */ + /* Add enough blanks to the end of the line so that we + can start adding the description */ - for (i = strlen(line); i < PARAM_WIDTH; ++i) { - line[i] = ' '; + for (i = strlen(line); i < PARAM_WIDTH; ++i) { + line[i] = ' '; + } + line[i] = '\0'; } - line[i] = '\0'; - } - - /* Loop over adding the description to the array, breaking - the string at most at MAX_WIDTH characters. We need a - modifyable description (for simplicity), so strdup the - clo_description (because it's likely a compiler - constant, and may barf if we write temporary \0's in - the middle). */ - - desc = strdup(option->clo_description); - if (NULL == desc) { - free(sorted); - opal_mutex_unlock(&cmd->lcl_mutex); - return strdup(""); - } - start = desc; - len = strlen(desc); - do { - /* Trim off leading whitespace */ + /* Loop over adding the description to the array, breaking + the string at most at MAX_WIDTH characters. We need a + modifyable description (for simplicity), so strdup the + clo_description (because it's likely a compiler + constant, and may barf if we write temporary \0's in + the middle). 
*/ - while (isspace(*start) && start < desc + len) { - ++start; - } - if (start >= desc + len) { - break; + desc = strdup(option->clo_description); + if (NULL == desc) { + free(sorted); + opal_mutex_unlock(&cmd->lcl_mutex); + return strdup(""); } + start = desc; + len = strlen(desc); + do { - /* Last line */ + /* Trim off leading whitespace */ - if (strlen(start) < (MAX_WIDTH - PARAM_WIDTH)) { - strncat(line, start, sizeof(line) - 1); - opal_argv_append(&argc, &argv, line); - break; - } + while (isspace(*start) && start < desc + len) { + ++start; + } + if (start >= desc + len) { + break; + } - /* We have more than 1 line's worth left -- find this - line's worth and add it to the array. Then reset - and loop around to get the next line's worth. */ + /* Last line */ - for (ptr = start + (MAX_WIDTH - PARAM_WIDTH); - ptr > start; --ptr) { - if (isspace(*ptr)) { - *ptr = '\0'; + if (strlen(start) < (MAX_WIDTH - PARAM_WIDTH)) { strncat(line, start, sizeof(line) - 1); opal_argv_append(&argc, &argv, line); - - start = ptr + 1; - memset(line, ' ', PARAM_WIDTH); - line[PARAM_WIDTH] = '\0'; break; } - } - /* If we got all the way back to the beginning of the - string, then go forward looking for a whitespace - and break there. */ + /* We have more than 1 line's worth left -- find this + line's worth and add it to the array. Then reset + and loop around to get the next line's worth. */ - if (ptr == start) { for (ptr = start + (MAX_WIDTH - PARAM_WIDTH); - ptr < start + len; ++ptr) { + ptr > start; --ptr) { if (isspace(*ptr)) { *ptr = '\0'; - strncat(line, start, sizeof(line) - 1); opal_argv_append(&argc, &argv, line); @@ -689,19 +692,45 @@ char *opal_cmd_line_get_usage_msg(opal_cmd_line_t *cmd) } } - /* If we reached the end of the string with no - whitespace, then just add it on and be done */ + /* If we got all the way back to the beginning of the + string, then go forward looking for a whitespace + and break there. 
*/ - if (ptr >= start + len) { - strncat(line, start, sizeof(line) - 1); - opal_argv_append(&argc, &argv, line); - start = desc + len + 1; + if (ptr == start) { + for (ptr = start + (MAX_WIDTH - PARAM_WIDTH); + ptr < start + len; ++ptr) { + if (isspace(*ptr)) { + *ptr = '\0'; + + strncat(line, start, sizeof(line) - 1); + opal_argv_append(&argc, &argv, line); + + start = ptr + 1; + memset(line, ' ', PARAM_WIDTH); + line[PARAM_WIDTH] = '\0'; + break; + } + } + + /* If we reached the end of the string with no + whitespace, then just add it on and be done */ + + if (ptr >= start + len) { + strncat(line, start, sizeof(line) - 1); + opal_argv_append(&argc, &argv, line); + start = desc + len + 1; + } } - } - } while (start < desc + len); - free(desc); + } while (start < desc + len); + free(desc); + } } } + if(otype == OPAL_CMD_LINE_OTYPE_NULL || otype == OPAL_CMD_LINE_OTYPE_GENERAL) { + char *argument_line = "\nFor additional mpirun arguments, run 'mpirun --help <category>'\n\nThe following categories exist: general (Defaults to this option), debug,\n output, input, mapping, ranking, binding, devel (arguments useful to OMPI\n Developers), compatibility (arguments supported for backwards compatibility),\n launch (arguments to modify launch options), and dvm (Distributed Virtual\n Machine arguments)."; + + opal_argv_append(&argc, &argv, argument_line); + } if (NULL != argv) { ret = opal_argv_join(argv, '\n'); opal_argv_free(argv); @@ -798,7 +827,7 @@ char *opal_cmd_line_get_param(opal_cmd_line_t *cmd, const char *opt, int inst, opal_list_get_end(&cmd->lcl_params) != item; item = opal_list_get_next(item)) { param = (cmd_line_param_t *) item; - if (param->clp_option == option) { + if (param->clp_argc > 0 && param->clp_option == option) { if (num_found == inst) { opal_mutex_unlock(&cmd->lcl_mutex); return param->clp_argv[idx]; @@ -872,6 +901,7 @@ static void option_constructor(cmd_line_option_t *o) o->clo_mca_param_env_var = NULL; o->clo_variable_dest = NULL; o->clo_variable_set =
false; + o->clo_otype = OPAL_CMD_LINE_OTYPE_NULL; } @@ -915,7 +945,7 @@ static void cmd_line_constructor(opal_cmd_line_t *cmd) only thread that has this instance), there's no need to lock it right now. */ - OBJ_CONSTRUCT(&cmd->lcl_mutex, opal_mutex_t); + OBJ_CONSTRUCT(&cmd->lcl_mutex, opal_recursive_mutex_t); /* Initialize the lists */ @@ -1012,6 +1042,8 @@ static int make_opt(opal_cmd_line_t *cmd, opal_cmd_line_init_t *e) &option->clo_mca_param_env_var); } + option->clo_otype = e->ocl_otype; + /* Append the item, serializing thread access */ opal_mutex_lock(&cmd->lcl_mutex); @@ -1307,3 +1339,80 @@ static int qsort_callback(const void *aa, const void *bb) return 0; } + + +/* + * Helper function to find the option type specified in the help + * command. + */ +static opal_cmd_line_otype_t get_help_otype(opal_cmd_line_t *cmd) +{ + /* Initialize to NULL, if it remains so, the user asked for + "full" help output */ + opal_cmd_line_otype_t otype = OPAL_CMD_LINE_OTYPE_NULL; + char *arg; + + arg = opal_cmd_line_get_param(cmd, "help", 0, 0); + + /* If not "help", check for "h" */ + if(NULL == arg) { + arg = opal_cmd_line_get_param(cmd, "h", 0, 0); + } + + /* If arg is still NULL, give them the General info by default */ + if(NULL == arg) { + arg = "general"; + } + + if (0 == strcmp(arg, "debug")) { + otype = OPAL_CMD_LINE_OTYPE_DEBUG; + } else if (0 == strcmp(arg, "output")) { + otype = OPAL_CMD_LINE_OTYPE_OUTPUT; + } else if (0 == strcmp(arg, "input")) { + otype = OPAL_CMD_LINE_OTYPE_INPUT; + } else if (0 == strcmp(arg, "mapping")) { + otype = OPAL_CMD_LINE_OTYPE_MAPPING; + } else if (0 == strcmp(arg, "ranking")) { + otype = OPAL_CMD_LINE_OTYPE_RANKING; + } else if (0 == strcmp(arg, "binding")) { + otype = OPAL_CMD_LINE_OTYPE_BINDING; + } else if (0 == strcmp(arg, "devel")) { + otype = OPAL_CMD_LINE_OTYPE_DEVEL; + } else if (0 == strcmp(arg, "compatibility")) { + otype = OPAL_CMD_LINE_OTYPE_COMPAT; + } else if (0 == strcmp(arg, "launch")) { + otype = 
OPAL_CMD_LINE_OTYPE_LAUNCH; + } else if (0 == strcmp(arg, "dvm")) { + otype = OPAL_CMD_LINE_OTYPE_DVM; + } else if (0 == strcmp(arg, "general")) { + otype = OPAL_CMD_LINE_OTYPE_GENERAL; + } else if (0 == strcmp(arg, "parsable")) { + otype = OPAL_CMD_LINE_OTYPE_PARSABLE; + } + + return otype; +} + +/* + * Build the "short:single-dash:long:num-params:description" record printed for one + * option by the parsable help listing. A '\0' short name is written as '0' and a NULL + * string field as "(null)"; no newline is added because the caller joins records with + * '\n'. Returns a malloc'd string the caller must free, or NULL on failure. + */ +static char *build_parsable(cmd_line_option_t *option) { + const char *sdash = (NULL != option->clo_single_dash_name) ? option->clo_single_dash_name : "(null)"; /* NULL for %s is undefined */ + const char *lname = (NULL != option->clo_long_name) ? option->clo_long_name : "(null)"; + const char *desc = (NULL != option->clo_description) ? option->clo_description : "(null)"; + char sname = ('\0' != option->clo_short_name) ? option->clo_short_name : '0'; + char *line; + int length; + length = snprintf(NULL, 0, "%c:%s:%s:%d:%s", sname, sdash, lname, option->clo_num_params, desc); + if (length < 0) { + return NULL; + } + line = (char *)malloc((size_t)length + 1); /* snprintf's count excludes the terminating NUL */ + if (NULL != line) { + snprintf(line, (size_t)length + 1, "%c:%s:%s:%d:%s", sname, sdash, lname, option->clo_num_params, desc); + } + return line; +} diff --git a/opal/util/cmd_line.h b/opal/util/cmd_line.h index d42c5551956..2f23d0477dc 100644 --- a/opal/util/cmd_line.h +++ b/opal/util/cmd_line.h @@ -11,8 +11,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015-2016 Intel, Inc. All rights reserved. - * Copyright (c) 2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights * reserved.
* $COPYRIGHT$ * @@ -132,7 +132,7 @@ BEGIN_C_DECLS opal_object_t super; /** Thread safety */ - opal_mutex_t lcl_mutex; + opal_recursive_mutex_t lcl_mutex; /** List of cmd_line_option_t's (defined internally) */ opal_list_t lcl_options; @@ -176,6 +176,33 @@ BEGIN_C_DECLS */ typedef enum opal_cmd_line_type_t opal_cmd_line_type_t; + /** + * Command line option type: the category an option is listed under in + * mpirun --help output. + */ + enum opal_cmd_line_otype_t { + OPAL_CMD_LINE_OTYPE_GENERAL, /* --help general; also the category shown when --help has no argument */ + OPAL_CMD_LINE_OTYPE_DEBUG, /* --help debug */ + OPAL_CMD_LINE_OTYPE_OUTPUT, /* --help output */ + OPAL_CMD_LINE_OTYPE_INPUT, /* --help input */ + OPAL_CMD_LINE_OTYPE_MAPPING, /* --help mapping */ + OPAL_CMD_LINE_OTYPE_RANKING, /* --help ranking */ + OPAL_CMD_LINE_OTYPE_BINDING, /* --help binding */ + OPAL_CMD_LINE_OTYPE_DEVEL, /* --help devel: arguments useful to OMPI developers */ + OPAL_CMD_LINE_OTYPE_COMPAT, /* --help compatibility: arguments kept for backwards compatibility */ + OPAL_CMD_LINE_OTYPE_LAUNCH, /* --help launch: arguments that modify launch options */ + OPAL_CMD_LINE_OTYPE_DVM, /* --help dvm: Distributed Virtual Machine arguments */ + OPAL_CMD_LINE_OTYPE_UNSUPPORTED, /* no --help keyword maps to this value in get_help_otype() */ + OPAL_CMD_LINE_OTYPE_PARSABLE, /* --help parsable: machine-readable record for every option */ + OPAL_CMD_LINE_OTYPE_NULL /* no category: set by option_constructor(); as a filter it selects every option */ + }; + /** + * \internal + * + * Convenience typedef + */ + typedef enum opal_cmd_line_otype_t opal_cmd_line_otype_t; + /** * Datatype used to construct a command line handle; see * opal_cmd_line_create(). @@ -207,6 +234,9 @@ BEGIN_C_DECLS /** Description of the command line option, to be used with opal_cmd_line_get_usage_msg(). */ const char *ocl_description; + + /** Category for mpirun --help output */ + opal_cmd_line_otype_t ocl_otype; }; /** * \internal diff --git a/orte/mca/ess/alps/ess_alps_module.c b/orte/mca/ess/alps/ess_alps_module.c index 4f0f47b501c..1109c360e21 100644 --- a/orte/mca/ess/alps/ess_alps_module.c +++ b/orte/mca/ess/alps/ess_alps_module.c @@ -12,6 +12,7 @@ * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * * Additional copyrights may follow @@ -58,7 +59,6 @@ static int rte_init(void) { int ret; char *error = NULL; - char **hosts = NULL; OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output, "ess:alps in rte_init")); @@ -90,23 +90,11 @@ static int rte_init(void) * default procedure */ if (ORTE_PROC_IS_DAEMON) { - if (NULL != orte_node_regex) { - /* extract the nodes */ - if (ORTE_SUCCESS != (ret = - orte_regex_extract_node_names(orte_node_regex, &hosts)) || - NULL == hosts) { - error = "orte_regex_extract_node_names"; - goto fn_fail; - } - } - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto fn_fail; } - if (NULL != hosts) { - opal_argv_free(hosts); - } /* * now synchronize with aprun. diff --git a/orte/mca/ess/base/base.h b/orte/mca/ess/base/base.h index 4387a5e98d8..2fefed08455 100644 --- a/orte/mca/ess/base/base.h +++ b/orte/mca/ess/base/base.h @@ -12,7 +12,7 @@ * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved. - * Copyright (c) 2013 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -67,7 +67,7 @@ ORTE_DECLSPEC void orte_ess_base_app_abort(int status, bool report); ORTE_DECLSPEC int orte_ess_base_tool_setup(void); ORTE_DECLSPEC int orte_ess_base_tool_finalize(void); -ORTE_DECLSPEC int orte_ess_base_orted_setup(char **hosts); +ORTE_DECLSPEC int orte_ess_base_orted_setup(void); ORTE_DECLSPEC int orte_ess_base_orted_finalize(void); /* Detect whether or not this proc is bound - if not, diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index ce6bdd5fe9b..a3e3e2d44fc 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -103,7 +103,7 @@ static void setup_sighandler(int signal, opal_event_t *ev, } -int orte_ess_base_orted_setup(char **hosts) +int orte_ess_base_orted_setup(void) { int ret = ORTE_ERROR; int fd; @@ -113,7 +113,6 @@ int orte_ess_base_orted_setup(char **hosts) orte_job_t *jdata; orte_proc_t *proc; orte_app_context_t *app; - orte_node_t *node; char *param; hwloc_obj_t obj; unsigned i, j; @@ -218,12 +217,9 @@ int orte_ess_base_orted_setup(char **hosts) * a specific module to use */ (void) mca_base_var_env_name("plm", &param); - plm_in_use = !!(getenv(param)); free (param); - if (plm_in_use) { - if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_open"; @@ -332,11 +328,6 @@ int orte_ess_base_orted_setup(char **hosts) app = OBJ_NEW(orte_app_context_t); opal_pointer_array_set_item(jdata->apps, 0, app); jdata->num_apps++; - /* create and store a node object where we are */ - node = OBJ_NEW(orte_node_t); - node->name = strdup(orte_process_info.nodename); - node->index = ORTE_PROC_MY_NAME->vpid; - opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node); /* create and store a proc object for us */ proc = OBJ_NEW(orte_proc_t); @@ -345,19 +336,6 @@ int orte_ess_base_orted_setup(char **hosts) proc->pid =
orte_process_info.pid; proc->state = ORTE_PROC_STATE_RUNNING; opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); - /* record that the daemon (i.e., us) is on this node - * NOTE: we do not add the proc object to the node's - * proc array because we are not an application proc. - * Instead, we record it in the daemon field of the - * node object - */ - OBJ_RETAIN(proc); /* keep accounting straight */ - node->daemon = proc; - ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED); - node->state = ORTE_NODE_STATE_UP; - /* now point our proc node field to the node */ - OBJ_RETAIN(node); /* keep accounting straight */ - proc->node = node; /* record that the daemon job is running */ jdata->num_procs = 1; jdata->state = ORTE_JOB_STATE_RUNNING; @@ -514,7 +492,6 @@ int orte_ess_base_orted_setup(char **hosts) orte_topo_signature = opal_hwloc_base_get_topo_signature(opal_hwloc_topology); t->sig = strdup(orte_topo_signature); opal_pointer_array_add(orte_node_topologies, t); - node->topology = t; if (15 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) { opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO); @@ -526,12 +503,25 @@ int orte_ess_base_orted_setup(char **hosts) * after we enable_comm as that function determines our * own port, which we need in order to construct the nidmap */ - if (NULL != hosts) { + if (NULL != orte_node_regex) { + if (ORTE_SUCCESS != (ret = orte_util_nidmap_parse(orte_node_regex))) { + ORTE_ERROR_LOG(ret); + error = "construct nidmap"; + goto error; + } + } + + if (orte_static_ports) { + if (NULL == orte_node_regex) { + /* we didn't get the node info */ + error = "cannot construct daemon map for static ports - no node map info"; + goto error; + } /* extract the node info from the environment and * build a nidmap from it - this will update the * routing plan as well */ - if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) { + if 
(ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap())) { ORTE_ERROR_LOG(ret); error = "construct daemon map from static ports"; goto error; @@ -635,6 +625,7 @@ int orte_ess_base_orted_setup(char **hosts) } return ORTE_SUCCESS; + error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", diff --git a/orte/mca/ess/env/ess_env_module.c b/orte/mca/ess/env/ess_env_module.c index c04b8c0c83f..bc4152e23e4 100644 --- a/orte/mca/ess/env/ess_env_module.c +++ b/orte/mca/ess/env/ess_env_module.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -98,7 +98,6 @@ static int rte_init(void) { int ret; char *error = NULL; - char **hosts = NULL; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -112,19 +111,11 @@ static int rte_init(void) /* if I am a daemon, complete my setup using the * default procedure */ - if (NULL != orte_node_regex) { - /* extract the nodes */ - if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) { - error = "orte_regex_extract_node_names"; - goto error; - } - } - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } - opal_argv_free(hosts); return ORTE_SUCCESS; error: diff --git a/orte/mca/ess/lsf/ess_lsf_module.c b/orte/mca/ess/lsf/ess_lsf_module.c index f9aef64269c..cb200e4df3c 100644 --- a/orte/mca/ess/lsf/ess_lsf_module.c +++ b/orte/mca/ess/lsf/ess_lsf_module.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2011 Cisco Systems, Inc. 
All rights reserved. - * Copyright (c) 2013 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -68,7 +68,6 @@ static int rte_init(void) { int ret; char *error = NULL; - char **hosts = NULL; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -83,19 +82,11 @@ static int rte_init(void) * default procedure */ if (ORTE_PROC_IS_DAEMON) { - if (NULL != orte_node_regex) { - /* extract the nodes */ - if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) { - error = "orte_regex_extract_node_names"; - goto error; - } - } - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } - opal_argv_free(hosts); return ORTE_SUCCESS; } diff --git a/orte/mca/ess/slurm/ess_slurm_module.c b/orte/mca/ess/slurm/ess_slurm_module.c index 472b6aa9ee1..c645c4ecaa0 100644 --- a/orte/mca/ess/slurm/ess_slurm_module.c +++ b/orte/mca/ess/slurm/ess_slurm_module.c @@ -10,7 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -62,7 +62,6 @@ static int rte_init(void) { int ret; char *error = NULL; - char **hosts = NULL; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -77,23 +76,11 @@ static int rte_init(void) * default procedure */ if (ORTE_PROC_IS_DAEMON) { - if (NULL != orte_node_regex) { - /* extract the nodes */ - if (ORTE_SUCCESS != (ret = - orte_regex_extract_node_names(orte_node_regex, &hosts)) || - NULL == hosts) { - error = "orte_regex_extract_node_names"; - goto error; - } - } - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } - if (NULL != hosts) { - opal_argv_free(hosts); - } return ORTE_SUCCESS; } diff --git a/orte/mca/ess/tm/ess_tm_module.c b/orte/mca/ess/tm/ess_tm_module.c index 0ebad54b7a4..b9fe8e0cbe6 100644 --- a/orte/mca/ess/tm/ess_tm_module.c +++ b/orte/mca/ess/tm/ess_tm_module.c @@ -67,7 +67,6 @@ static int rte_init(void) { int ret; char *error = NULL; - char **hosts = NULL; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -82,21 +81,11 @@ static int rte_init(void) * default procedure */ if (ORTE_PROC_IS_DAEMON) { - if (NULL != orte_node_regex) { - /* extract the nodes */ - if (ORTE_SUCCESS != (ret = - orte_regex_extract_node_names(orte_node_regex, &hosts)) || - NULL == hosts) { - error = "orte_regex_extract_node_names"; - goto error; - } - } - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } - opal_argv_free(hosts); return ORTE_SUCCESS; } @@ -194,4 +183,3 @@ static int tm_set_name(void) return ORTE_SUCCESS; } - diff --git a/orte/mca/grpcomm/direct/grpcomm_direct.c b/orte/mca/grpcomm/direct/grpcomm_direct.c index e69068a711a..0621d5db124 100644 --- 
a/orte/mca/grpcomm/direct/grpcomm_direct.c +++ b/orte/mca/grpcomm/direct/grpcomm_direct.c @@ -270,7 +270,7 @@ static void xcast_recv(int status, orte_process_name_t* sender, opal_list_t coll; orte_grpcomm_signature_t *sig; orte_rml_tag_t tag; - char *rtmod; + char *rtmod, *nidmap; size_t inlen, cmplen; uint8_t *packed_data, *cmpdata; @@ -392,7 +392,8 @@ static void xcast_recv(int status, orte_process_name_t* sender, } opal_dss.copy_payload(relay, data); } else if (ORTE_DAEMON_ADD_LOCAL_PROCS == command || - ORTE_DAEMON_DVM_NIDMAP_CMD == command) { + ORTE_DAEMON_DVM_NIDMAP_CMD == command || + ORTE_DAEMON_DVM_ADD_PROCS == command) { /* setup our internal relay buffer */ relay = OBJ_NEW(opal_buffer_t); /* repack the command */ @@ -400,14 +401,25 @@ static void xcast_recv(int status, orte_process_name_t* sender, ORTE_ERROR_LOG(ret); goto relay; } - /* see if any daemons were launched */ + /* unpack the nidmap string - may be NULL */ + cnt = 1; + if (OPAL_SUCCESS != (ret = opal_dss.unpack(data, &nidmap, &cnt, OPAL_STRING))) { + ORTE_ERROR_LOG(ret); + goto relay; + } + if (NULL != nidmap) { + if (ORTE_SUCCESS != (ret = orte_util_nidmap_parse(nidmap))) { + ORTE_ERROR_LOG(ret); + goto relay; + } + free(nidmap); + } + /* see if they included info on node capabilities */ cnt = 1; if (OPAL_SUCCESS != (ret = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) { ORTE_ERROR_LOG(ret); goto relay; } - /* add it to our relay buffer as we will need it later */ - opal_dss.pack(relay, &flag, 1, OPAL_INT8); if (0 != flag) { /* update our local nidmap, if required - the decode function * knows what to do @@ -544,7 +556,7 @@ static void xcast_recv(int status, orte_process_name_t* sender, /* now pass the relay buffer to myself for processing - don't * inject it into the RML system via send as that will compete * with the relay messages down in the OOB. 
Instead, pass it - * directly to the orted command processor */ + * directly to the RML message processor */ if (ORTE_DAEMON_DVM_NIDMAP_CMD != command) { ORTE_RML_POST_MESSAGE(ORTE_PROC_MY_NAME, tag, 1, relay->base_ptr, relay->bytes_used); diff --git a/orte/mca/odls/alps/odls_alps_module.c b/orte/mca/odls/alps/odls_alps_module.c index 7d0e15d6f69..9d17521b440 100644 --- a/orte/mca/odls/alps/odls_alps_module.c +++ b/orte/mca/odls/alps/odls_alps_module.c @@ -16,6 +16,8 @@ * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2017 Rutgers, The State University of New Jersey. + * All rights reserved. * * $COPYRIGHT$ * @@ -423,6 +425,20 @@ static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd) sigprocmask(0, 0, &sigs); sigprocmask(SIG_UNBLOCK, &sigs, 0); + /* take us to the correct wdir */ + if (NULL != cd->wdir) { + if (0 != chdir(cd->wdir)) { + send_error_show_help(write_fd, 1, + "help-orterun.txt", + "orterun:wdir-not-found", + "orted", + cd->wdir, + orte_process_info.nodename, + (NULL == cd->child) ? 0 : cd->child->app_rank); + /* Does not return */ + } + } + /* Exec the new executable */ if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) { diff --git a/orte/mca/odls/base/help-orte-odls-base.txt b/orte/mca/odls/base/help-orte-odls-base.txt index cde63e5cfd6..29c83dbb1bd 100644 --- a/orte/mca/odls/base/help-orte-odls-base.txt +++ b/orte/mca/odls/base/help-orte-odls-base.txt @@ -6,6 +6,7 @@ # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2014 Research Organization for Information Science # and Technology (RIST). All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -46,6 +47,7 @@ Will continue attempting to launch the process. 
The xterm option was asked to display a rank that is larger than the number of procs in the job: +Node: %s Rank: %d Num procs: %d diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index f822e936b21..30462ac4faa 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -113,6 +113,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer, int8_t flag; void *nptr; uint32_t key; + char *nidmap; /* get the job data pointer */ if (NULL == (jdata = orte_get_job_data_object(job))) { @@ -127,19 +128,32 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer, return ORTE_SUCCESS; } - /* if we launched new daemons... */ - if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL)) { - /* flag that we did */ + /* if we couldn't provide the allocation regex on the orted + * cmd line, then we need to provide all the info here */ + if (!orte_nidmap_communicated) { + if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&nidmap))) { + ORTE_ERROR_LOG(rc); + return rc; + } + orte_nidmap_communicated = true; + } else { + nidmap = NULL; + } + opal_dss.pack(buffer, &nidmap, 1, OPAL_STRING); + if (NULL != nidmap) { + free(nidmap); + } + + /* if we haven't already done so, provide the info on the + * capabilities of each node */ + if (!orte_node_info_communicated || + orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL)) { flag = 1; opal_dss.pack(buffer, &flag, 1, OPAL_INT8); - - /* include a nodemap of the daemons */ if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(buffer))) { ORTE_ERROR_LOG(rc); return rc; } - - /* if we are not using static ports, we need to send the wireup info */ if (!orte_static_ports && !orte_fwd_mpirun_port) { /* pack a flag indicating wiring info is provided */ flag = 1; @@ -176,41 +190,52 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer, * copy of all 
active jobs so the grpcomm collectives can * properly work should a proc from one of the other jobs * interact with this one */ - OBJ_CONSTRUCT(&jobdata, opal_buffer_t); - numjobs = 0; - rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jptr, &nptr); - while (OPAL_SUCCESS == rc) { - /* skip the one we are launching now */ - if (NULL != jptr && jptr != jdata && - ORTE_PROC_MY_NAME->jobid != jptr->jobid) { - /* pack the job struct */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &jptr, 1, ORTE_JOB))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&jobdata); - return rc; + if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL)) { + flag = 1; + opal_dss.pack(buffer, &flag, 1, OPAL_INT8); + OBJ_CONSTRUCT(&jobdata, opal_buffer_t); + numjobs = 0; + rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jptr, &nptr); + while (OPAL_SUCCESS == rc) { + /* skip the one we are launching now */ + if (NULL != jptr && jptr != jdata && + ORTE_PROC_MY_NAME->jobid != jptr->jobid) { + /* pack the job struct */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &jptr, 1, ORTE_JOB))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&jobdata); + return rc; + } + ++numjobs; } - ++numjobs; + rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jptr, nptr, &nptr); } - rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jptr, nptr, &nptr); - } - /* pack the number of jobs */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &numjobs, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&jobdata); - return rc; - } - if (0 < numjobs) { - /* pack the jobdata buffer */ - wireup = &jobdata; - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &wireup, 1, OPAL_BUFFER))) { + /* pack the number of jobs */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &numjobs, 1, OPAL_INT32))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&jobdata); return rc; } - OBJ_DESTRUCT(&jobdata); + if (0 < numjobs) { + /* pack the 
jobdata buffer */ + wireup = &jobdata; + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &wireup, 1, OPAL_BUFFER))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&jobdata); + return rc; + } + OBJ_DESTRUCT(&jobdata); + } + } else { + flag = 0; + opal_dss.pack(buffer, &flag, 1, OPAL_INT8); } + orte_node_info_communicated = true; } else { - /* include a sentinel */ + /* mark that we didn't */ + flag = 0; + opal_dss.pack(buffer, &flag, 1, OPAL_INT8); + /* and that we didn't launch daemons */ flag = 0; opal_dss.pack(buffer, &flag, 1, OPAL_INT8); } @@ -671,7 +696,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) } /* did the user request we display output in xterms? */ - if (NULL != orte_xterm) { + if (NULL != orte_xterm && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { opal_list_item_t *nmitem; orte_namelist_t *nm; /* see if this rank is one of those requested */ @@ -717,9 +742,6 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) for (i=0; NULL != app->argv[i]; i++) { opal_argv_append_nosize(&cd->argv, app->argv[i]); } - /* the app exe name itself is in the argvsav array, so - * we can recover it from there later - */ cd->cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL); if (NULL == cd->cmd) { orte_show_help("help-orte-odls-base.txt", @@ -743,7 +765,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) } /* if we are indexing the argv by rank, do so now */ - if (cd->index_argv) { + if (cd->index_argv && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { char *param; asprintf(&param, "%s-%d", cd->argv[0], (int)child->name.vpid); free(cd->argv[0]); @@ -1115,10 +1137,15 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata) } if (NULL != effective_dir) { free(effective_dir); + effective_dir = NULL; } } GETOUT: + if (NULL != effective_dir) { + free(effective_dir); + effective_dir = NULL; + } /* tell the state machine that all local procs for this job * were launched so that
it can do whatever it needs to do, * like send a state update message for all procs to the HNP @@ -1586,6 +1613,12 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, * has happened */ ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID); + + /* Since we are not going to wait for this process, make sure + * we mark it as not-alive so that we don't wait for it + * in orted_cmd + */ + ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE); cd->child->pid = 0; /* mark the child as "killed" */ @@ -1718,6 +1751,9 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child, /* setup the path */ if (ORTE_SUCCESS != (rc = setup_path(app, &wdir))) { ORTE_ERROR_LOG(rc); + if (NULL != wdir) { + free(wdir); + } goto CLEANUP; } @@ -1774,12 +1810,6 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child, opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); - if (ORTE_SUCCESS != (rc = fork_local(cd))) { - orte_wait_cb_cancel(child); - child->exit_code = ORTE_ERR_SILENT; /* error message already output */ - ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START); - } - CLEANUP: OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, "%s odls:restart of proc %s %s", diff --git a/orte/mca/odls/default/help-orte-odls-default.txt b/orte/mca/odls/default/help-orte-odls-default.txt index 0e5d526e13f..06181b7c960 100644 --- a/orte/mca/odls/default/help-orte-odls-default.txt +++ b/orte/mca/odls/default/help-orte-odls-default.txt @@ -12,6 +12,7 @@ # All rights reserved. # Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. # Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -29,6 +30,7 @@ having specified a directory for your application. Your job will now abort. 
Local host: %s + Working dir: %s Application name: %s Error: %s # diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index 5ad54b93fb3..c95946d4193 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -16,6 +16,8 @@ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2017 Rutgers, The State University of New Jersey. + * All rights reserved. * * $COPYRIGHT$ * @@ -416,7 +418,16 @@ static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd) /* take us to the correct wdir */ if (NULL != cd->wdir) { - chdir(cd->wdir); + if (0 != chdir(cd->wdir)) { + send_error_show_help(write_fd, 1, + "help-orterun.txt", + "orterun:wdir-not-found", + "orted", + cd->wdir, + orte_process_info.nodename, + (NULL == cd->child) ? 0 : cd->child->app_rank); + /* Does not return */ + } } /* Exec the new executable */ diff --git a/orte/mca/odls/odls_types.h b/orte/mca/odls/odls_types.h index 82cef3ff4c1..1362b1b6332 100644 --- a/orte/mca/odls/odls_types.h +++ b/orte/mca/odls/odls_types.h @@ -89,6 +89,9 @@ typedef uint8_t orte_daemon_cmd_flag_t; /* request full topology string */ #define ORTE_DAEMON_REPORT_TOPOLOGY_CMD (orte_daemon_cmd_flag_t) 33 +/* tell DVM daemons to cleanup resources from job */ +#define ORTE_DAEMON_DVM_CLEANUP_JOB_CMD (orte_daemon_cmd_flag_t) 34 + /* * Struct written up the pipe from the child to the parent. 
*/ diff --git a/orte/mca/oob/base/base.h b/orte/mca/oob/base/base.h index 21595f26ad4..322ba0be1b1 100644 --- a/orte/mca/oob/base/base.h +++ b/orte/mca/oob/base/base.h @@ -49,8 +49,6 @@ BEGIN_C_DECLS -OPAL_TIMING_DECLARE_EXT(ORTE_DECLSPEC, tm_oob) - /* * Convenience Typedef */ diff --git a/orte/mca/oob/base/oob_base_frame.c b/orte/mca/oob/base/oob_base_frame.c index a20d20010e1..56ec2ad8fc0 100644 --- a/orte/mca/oob/base/oob_base_frame.c +++ b/orte/mca/oob/base/oob_base_frame.c @@ -52,8 +52,6 @@ * Global variables */ orte_oob_base_t orte_oob_base = {0}; -OPAL_TIMING_DECLARE(tm_oob) - static int orte_oob_base_register(mca_base_register_flag_t flags) { @@ -109,9 +107,6 @@ static int orte_oob_base_close(void) OBJ_DESTRUCT(&orte_oob_base.peers); - OPAL_TIMING_EVENT((&tm_oob, "Finish")); - OPAL_TIMING_REPORT(orte_oob_base.timing, &tm_oob); - return mca_base_framework_components_close(&orte_oob_base_framework, NULL); } @@ -137,8 +132,6 @@ static int orte_oob_base_open(mca_base_open_flag_t flags) orte_state.add_job_state(ORTE_JOB_STATE_FT_RESTART, orte_oob_base_ft_event, ORTE_ERROR_PRI); #endif - OPAL_TIMING_INIT(&tm_oob); - /* Open up all available components */ return mca_base_framework_components_open(&orte_oob_base_framework, flags); } diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index 4a54ff146a1..27810ec2457 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -240,17 +240,17 @@ static int tcp_component_register(void) MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_tcp_component.max_retries); - mca_oob_tcp_component.tcp_sndbuf = 128 * 1024; + mca_oob_tcp_component.tcp_sndbuf = 0; (void)mca_base_component_var_register(component, "sndbuf", - "TCP socket send buffering size (in bytes)", + "TCP socket send buffering size (in bytes, 0 => leave system default)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_4, MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_tcp_component.tcp_sndbuf); - 
mca_oob_tcp_component.tcp_rcvbuf = 128 * 1024; + mca_oob_tcp_component.tcp_rcvbuf = 0; (void)mca_base_component_var_register(component, "rcvbuf", - "TCP socket receive buffering size (in bytes)", + "TCP socket receive buffering size (in bytes, 0 => leave system default)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_4, MCA_BASE_VAR_SCOPE_LOCAL, diff --git a/orte/mca/oob/tcp/oob_tcp_sendrecv.c b/orte/mca/oob/tcp/oob_tcp_sendrecv.c index 1d7f9f87a4e..70a4c134128 100644 --- a/orte/mca/oob/tcp/oob_tcp_sendrecv.c +++ b/orte/mca/oob/tcp/oob_tcp_sendrecv.c @@ -112,9 +112,6 @@ static int send_msg(mca_oob_tcp_peer_t* peer, mca_oob_tcp_send_t* msg) int iov_count, retries = 0; ssize_t remain = msg->sdbytes, rc; - OPAL_TIMING_EVENT((&tm_oob, "to %s %d bytes", - ORTE_NAME_PRINT(&(peer->name)), msg->sdbytes)); - iov[0].iov_base = msg->sdptr; iov[0].iov_len = msg->sdbytes; if (!msg->hdr_sent) { @@ -341,9 +338,6 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata) static int read_bytes(mca_oob_tcp_peer_t* peer) { int rc; -#if OPAL_ENABLE_TIMING - int to_read = peer->recv_msg->rdbytes; -#endif /* read until all bytes recvd or error */ while (0 < peer->recv_msg->rdbytes) { @@ -415,9 +409,6 @@ static int read_bytes(mca_oob_tcp_peer_t* peer) peer->recv_msg->rdptr += rc; } - OPAL_TIMING_EVENT((&tm_oob, "from %s %d bytes", - ORTE_NAME_PRINT(&(peer->name)), to_read)); - /* we read the full data block */ return ORTE_SUCCESS; } @@ -432,9 +423,6 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata) mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)cbdata; int rc; orte_rml_send_t *snd; -#if OPAL_ENABLE_TIMING - bool timing_same_as_hdr = false; -#endif opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler called for peer %s", @@ -504,15 +492,7 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata) opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, 
"%s:tcp:recv:handler read hdr", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); -#if OPAL_ENABLE_TIMING - int to_recv = peer->recv_msg->rdbytes; -#endif if (ORTE_SUCCESS == (rc = read_bytes(peer))) { -#if OPAL_ENABLE_TIMING - timing_same_as_hdr = true; -#endif - OPAL_TIMING_EVENT((&tm_oob, "from %s %d bytes [header]", - ORTE_NAME_PRINT(&(peer->name)), to_recv)); /* completed reading the header */ peer->recv_msg->hdr_recvd = true; /* convert the header */ @@ -565,11 +545,6 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata) ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst), peer->recv_msg->hdr.tag); - OPAL_TIMING_EVENT((&tm_oob, "from %s %d bytes [body:%s]", - ORTE_NAME_PRINT(&(peer->name)), - (int)peer->recv_msg->hdr.nbytes, - (timing_same_as_hdr) ? "same" : "next")); - /* am I the intended recipient (header was already converted back to host order)? */ if (peer->recv_msg->hdr.dst.jobid == ORTE_PROC_MY_NAME->jobid && peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) { diff --git a/orte/mca/plm/alps/help-plm-alps.txt b/orte/mca/plm/alps/help-plm-alps.txt index f109299a862..c0e3d0470fb 100644 --- a/orte/mca/plm/alps/help-plm-alps.txt +++ b/orte/mca/plm/alps/help-plm-alps.txt @@ -10,6 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -39,7 +40,3 @@ the map for this application. This can be caused by a lack of an allocation, or by an error in the Open MPI code. Please check to ensure you have a ALPS allocation. If you do, then please pass the error to the Open MPI user's mailing list for assistance. -# -[slurm-not-supported] -mpirun is not a supported launcher on Cray XC using Native SLURM. -srun must be used to launch jobs on these systems. 
diff --git a/orte/mca/plm/alps/plm_alps.h b/orte/mca/plm/alps/plm_alps.h index d15ae07ffa0..bdc039fedaf 100644 --- a/orte/mca/plm/alps/plm_alps.h +++ b/orte/mca/plm/alps/plm_alps.h @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -46,7 +47,6 @@ ORTE_MODULE_DECLSPEC extern orte_plm_alps_component_t mca_plm_alps_component; ORTE_DECLSPEC extern orte_plm_base_module_t orte_plm_alps_module; -extern bool mca_plm_alps_using_aprun; END_C_DECLS diff --git a/orte/mca/plm/alps/plm_alps_component.c b/orte/mca/plm/alps/plm_alps_component.c index e474cd59130..f906a5cb1be 100644 --- a/orte/mca/plm/alps/plm_alps_component.c +++ b/orte/mca/plm/alps/plm_alps_component.c @@ -12,6 +12,7 @@ * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -43,7 +44,6 @@ */ const char *mca_plm_alps_component_version_string = "Open MPI alps plm MCA component version " ORTE_VERSION; -bool mca_plm_alps_using_aprun = {true}; /* @@ -158,7 +158,11 @@ static int orte_plm_alps_component_query(mca_base_module_t **module, int *priori } if((NULL != wlm_detected) && !strcmp(slurm, wlm_detected)) { - mca_plm_alps_using_aprun = false; + /* we are in a Cray SLURM environment, so we don't want + * this plm component */ + *priority = 0; + *module = NULL; + return ORTE_ERROR; } #endif diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c index 25499442124..61b1c32dba6 100644 --- a/orte/mca/plm/alps/plm_alps_module.c +++ b/orte/mca/plm/alps/plm_alps_module.c @@ -121,23 +121,6 @@ static int plm_alps_init(void) return rc; } - /* - * owing to way the SLURM PLM component works, we can't use - * it on Cray XC systems as currently designed. The problem - * is the MPI processes launched on the head node (where the - * ORTE_PROC_IS_HNP evalues to true) get launched by a daemon - * (mpirun) which is not a child of a slurmd daemon. This - * means that any RDMA credentials obtained via the odls/alps - * local launcher are incorrect. - * - * So for now, we just don't support mpirun launched jobs - * on Cray XC systems using Native SLURM. 
- */ - if (false == mca_plm_alps_using_aprun) { - orte_show_help("help-plm-alps.txt", "slurm-not-supported", true); - exit(-1); - } - if (orte_do_not_launch) { /* must map daemons since we won't be launching them */ orte_plm_globals.daemon_nodes_assigned_at_launch = true; @@ -306,41 +289,42 @@ static void launch_daemons(int fd, short args, void *cbdata) opal_argv_append(&argc, &argv, "-e"); opal_argv_append(&argc, &argv, "OMPI_NO_USE_CRAY_PMI=1"); - /* create nodelist */ - nodelist_argv = NULL; - nodelist_argc = 0; - - for (nnode=0; nnode < map->nodes->size; nnode++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) { - continue; - } - - /* if the daemon already exists on this node, then - * don't include it - */ - if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) { - continue; - } - - /* otherwise, add it to the list of nodes upon which - * we need to launch a daemon - */ - opal_argv_append(&nodelist_argc, &nodelist_argv, node->name); - } - if (0 == opal_argv_count(nodelist_argv)) { - orte_show_help("help-plm-alps.txt", "no-hosts-in-list", true); - rc = ORTE_ERR_FAILED_TO_START; - goto cleanup; - } - nodelist_flat = opal_argv_join(nodelist_argv, ','); - opal_argv_free(nodelist_argv); - /* if we are using all allocated nodes, then alps * doesn't need a nodelist, or if running without a batch scheduler */ if ((map->num_new_daemons < orte_num_allocated_nodes) || (orte_num_allocated_nodes == 0)) { + /* create nodelist */ + nodelist_argv = NULL; + nodelist_argc = 0; + + for (nnode=0; nnode < map->nodes->size; nnode++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) { + continue; + } + + /* if the daemon already exists on this node, then + * don't include it + */ + if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) { + continue; + } + + /* otherwise, add it to the list of nodes upon which + * we need to launch a daemon + */ + opal_argv_append(&nodelist_argc, &nodelist_argv, 
node->name); + } + if (0 == opal_argv_count(nodelist_argv)) { + orte_show_help("help-plm-alps.txt", "no-hosts-in-list", true); + rc = ORTE_ERR_FAILED_TO_START; + goto cleanup; + } + nodelist_flat = opal_argv_join(nodelist_argv, ','); + opal_argv_free(nodelist_argv); + opal_argv_append(&argc, &argv, "-L"); opal_argv_append(&argc, &argv, nodelist_flat); + free(nodelist_flat); } @@ -351,20 +335,10 @@ static void launch_daemons(int fd, short args, void *cbdata) /* add the daemon command (as specified by user) */ orte_plm_base_setup_orted_cmd(&argc, &argv); - /* ensure that mpirun is - * on the list. Since alps won't be launching a daemon on it, - * it won't have been placed on the list, so create a new - * version here that includes it */ - asprintf(&ltmp, "%s,%s", orte_process_info.nodename, nodelist_flat); - free(nodelist_flat); - nodelist_flat = ltmp; - /* Add basic orted command line options, including debug flags */ orte_plm_base_orted_append_basic_args(&argc, &argv, NULL, - &proc_vpid_index, - nodelist_flat); - free(nodelist_flat); + &proc_vpid_index); /* tell the new daemons the base of the name list so they can compute * their own name on the other end diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index a9dbc4f0416..49890762f2f 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -14,7 +14,7 @@ * et Automatique. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2016 Research Organization for Information Science + * Copyright (c) 2014-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved.
* $COPYRIGHT$ @@ -817,6 +817,10 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender, int i; uint32_t h; orte_job_t *jdata; + uint8_t flag; + size_t inlen, cmplen; + uint8_t *packed_data, *cmpdata; + opal_buffer_t datbuf, *data; OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, "%s plm:base:daemon_topology recvd for daemon %s", @@ -832,10 +836,55 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender, orted_failed_launch = true; goto CLEANUP; } + OBJ_CONSTRUCT(&datbuf, opal_buffer_t); + /* unpack the flag to see if this payload is compressed */ + idx=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &idx, OPAL_INT8))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; + } + if (flag) { + /* unpack the data size */ + idx=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &inlen, &idx, OPAL_SIZE))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; + } + /* unpack the unpacked data size */ + idx=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &cmplen, &idx, OPAL_SIZE))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; + } + /* allocate the space */ + packed_data = (uint8_t*)malloc(inlen); + /* unpack the data blob */ + idx = inlen; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, packed_data, &idx, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; + } + /* decompress the data */ + if (orte_util_uncompress_block(&cmpdata, cmplen, + packed_data, inlen)) { + /* the data has been uncompressed */ + opal_dss.load(&datbuf, cmpdata, cmplen); + data = &datbuf; + } else { + data = buffer; + } + free(packed_data); + } else { + data = buffer; + } /* unpack the topology signature for this node */ idx=1; - if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &sig, &idx, OPAL_STRING))) { + if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &sig, &idx, OPAL_STRING))) { ORTE_ERROR_LOG(rc); orted_failed_launch 
= true; goto CLEANUP; @@ -861,7 +910,7 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender, /* unpack the topology */ idx=1; - if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO))) { + if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &topo, &idx, OPAL_HWLOC_TOPO))) { ORTE_ERROR_LOG(rc); orted_failed_launch = true; goto CLEANUP; @@ -873,7 +922,7 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender, /* unpack any coprocessors */ idx=1; - if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &coprocessors, &idx, OPAL_STRING))) { + if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &coprocessors, &idx, OPAL_STRING))) { ORTE_ERROR_LOG(rc); orted_failed_launch = true; goto CLEANUP; @@ -900,7 +949,7 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender, } /* see if this daemon is on a coprocessor */ idx=1; - if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &coprocessors, &idx, OPAL_STRING))) { + if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &coprocessors, &idx, OPAL_STRING))) { ORTE_ERROR_LOG(rc); orted_failed_launch = true; goto CLEANUP; @@ -1037,20 +1086,6 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&daemon->name), nodename)); - /* look this node up, if necessary */ - if (!orte_plm_globals.daemon_nodes_assigned_at_launch) { - OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, - "%s plm:base:orted_report_launch attempting to assign daemon %s to node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&dname), nodename)); - /* to "relocate" the daemon, we just update the name of - * the node object pointed to by this daemon */ - free(daemon->node->name); - daemon->node->name = strdup(nodename); - /* mark that it was verified */ - ORTE_FLAG_SET(daemon->node, ORTE_NODE_FLAG_LOC_VERIFIED); - } - /* mark the daemon as launched */ ORTE_FLAG_SET(daemon->node, 
ORTE_NODE_FLAG_DAEMON_LAUNCHED); @@ -1102,8 +1137,57 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, /* rank=1 always sends its topology back */ topo = NULL; if (1 == dname.vpid) { + uint8_t flag; + size_t inlen, cmplen; + uint8_t *packed_data, *cmpdata; + opal_buffer_t datbuf, *data; + OBJ_CONSTRUCT(&datbuf, opal_buffer_t); + /* unpack the flag to see if this payload is compressed */ + idx=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &idx, OPAL_INT8))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; + } + if (flag) { + /* unpack the data size */ + idx=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &inlen, &idx, OPAL_SIZE))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; + } + /* unpack the unpacked data size */ + idx=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &cmplen, &idx, OPAL_SIZE))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; + } + /* allocate the space */ + packed_data = (uint8_t*)malloc(inlen); + /* unpack the data blob */ + idx = inlen; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, packed_data, &idx, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + orted_failed_launch = true; + goto CLEANUP; + } + /* decompress the data */ + if (orte_util_uncompress_block(&cmpdata, cmplen, + packed_data, inlen)) { + /* the data has been uncompressed */ + opal_dss.load(&datbuf, cmpdata, cmplen); + data = &datbuf; + } else { + data = buffer; + } + free(packed_data); + } else { + data = buffer; + } idx=1; - if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO))) { + if (OPAL_SUCCESS != (rc = opal_dss.unpack(data, &topo, &idx, OPAL_HWLOC_TOPO))) { ORTE_ERROR_LOG(rc); orted_failed_launch = true; goto CLEANUP; @@ -1312,8 +1396,7 @@ int orte_plm_base_setup_orted_cmd(int *argc, char ***argv) */ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, char *ess, - int *proc_vpid_index, - char *nodes) + int 
*proc_vpid_index) { char *param = NULL; const char **tmp_value, **tmp_value2; @@ -1321,7 +1404,6 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, char *tmp_force = NULL; int i, j, cnt, rc; orte_job_t *jdata; - char *rml_uri; unsigned long num_procs; bool ignore; @@ -1411,39 +1493,32 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, opal_argv_append(argc, argv, param); free(param); - /* pass the uri of the hnp */ - if (ORTE_PROC_IS_HNP) { - rml_uri = orte_rml.get_contact_info(); - } else { - rml_uri = orte_rml.get_contact_info(); - opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); - opal_argv_append(argc, argv, "orte_parent_uri"); - opal_argv_append(argc, argv, rml_uri); - free(rml_uri); - - rml_uri = strdup(orte_process_info.my_hnp_uri); - } - opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); - opal_argv_append(argc, argv, "orte_hnp_uri"); - opal_argv_append(argc, argv, rml_uri); - free(rml_uri); - - /* pass the node list if one was given*/ + /* convert the nodes with daemons to a regex */ param = NULL; - if (NULL != nodes) { - /* convert the nodes to a regex */ - if (ORTE_SUCCESS != (rc = orte_regex_create(nodes, &param))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } else if (NULL != orte_node_regex) { - param = strdup(orte_node_regex); + if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&param))) { + ORTE_ERROR_LOG(rc); + return rc; } - if (NULL != param) { + /* if this is too long, then we'll have to do it with + * a phone home operation instead */ + if (strlen(param) < ORTE_MAX_REGEX_CMD_LENGTH) { opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); opal_argv_append(argc, argv, "orte_node_regex"); opal_argv_append(argc, argv, param); - free(param); + /* mark that the nidmap has been communicated */ + orte_nidmap_communicated = true; + } + free(param); + + if (!orte_static_ports && !orte_fwd_mpirun_port) { + /* if we are using static ports, or we are forwarding + * mpirun's port, then we would have built all the + *
connection info and so there is nothing to be passed. + * Otherwise, we have to pass the HNP uri so we can + * phone home */ + opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); + opal_argv_append(argc, argv, "orte_hnp_uri"); + opal_argv_append(argc, argv, orte_process_info.my_hnp_uri); } /* if requested, pass our port */ @@ -1994,7 +2069,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) if (orte_hnp_is_allocated) { node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); OBJ_RETAIN(node); - opal_list_append(&nodes, &node->super); + opal_list_prepend(&nodes, &node->super); } for (i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { @@ -2028,15 +2103,11 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) } /* ensure we are not on the list */ - for (item = opal_list_get_first(&nodes); - item != opal_list_get_end(&nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; - if (0 == node->index) { - opal_list_remove_item(&nodes, item); - OBJ_RELEASE(item); - break; - } + item = opal_list_get_first(&nodes); + node = (orte_node_t*)item; + if (0 == node->index) { + opal_list_remove_item(&nodes, item); + OBJ_RELEASE(item); } /* if we didn't get anything, then we are the only node in the diff --git a/orte/mca/plm/base/plm_private.h b/orte/mca/plm/base/plm_private.h index 835c6de8430..047a508394c 100644 --- a/orte/mca/plm/base/plm_private.h +++ b/orte/mca/plm/base/plm_private.h @@ -114,8 +114,7 @@ ORTE_DECLSPEC void orte_plm_base_recv(int status, orte_process_name_t* sender, */ ORTE_DECLSPEC int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, char *ess_module, - int *proc_vpid_index, - char *nodes); + int *proc_vpid_index); /* * Proxy functions for use by daemons and application procs diff --git a/orte/mca/plm/lsf/plm_lsf_module.c b/orte/mca/plm/lsf/plm_lsf_module.c index df5e0d95002..461feda8684 100644 --- 
a/orte/mca/plm/lsf/plm_lsf_module.c +++ b/orte/mca/plm/lsf/plm_lsf_module.c @@ -160,7 +160,6 @@ static void launch_daemons(int fd, short args, void *cbdata) int rc; char** env = NULL; char **nodelist_argv; - char *nodelist; int nodelist_argc; char *vpid_string; int i; @@ -257,19 +256,11 @@ static void launch_daemons(int fd, short args, void *cbdata) /* add the daemon command (as specified by user) */ orte_plm_base_setup_orted_cmd(&argc, &argv); - /* we need mpirun to be the first node on this list - since we - * aren't launching mpirun via TM, it won't be there now */ - opal_argv_prepend_nosize(&nodelist_argv, orte_process_info.nodename); - nodelist = opal_argv_join(nodelist_argv, ','); - opal_argv_free(nodelist_argv); - /* Add basic orted command line options */ orte_plm_base_orted_append_basic_args(&argc, &argv, "lsf", - &proc_vpid_index, - nodelist); - free(nodelist); + &proc_vpid_index); /* tell the new daemons the base of the name list so they can compute * their own name on the other end diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index a8cd21e0022..ac1f501c390 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -328,8 +328,7 @@ static void rsh_wait_daemon(orte_proc_t *daemon, void* cbdata) static int setup_launch(int *argcptr, char ***argvptr, char *nodename, int *node_name_index1, - int *proc_vpid_index, char *prefix_dir, - char *nodelist) + int *proc_vpid_index, char *prefix_dir) { int argc; char **argv; @@ -613,8 +612,7 @@ static int setup_launch(int *argcptr, char ***argvptr, */ orte_plm_base_orted_append_basic_args(&argc, &argv, "env", - proc_vpid_index, - nodelist); + proc_vpid_index); /* ensure that only the ssh plm is selected on the remote daemon */ opal_argv_append_nosize(&argv, "-"OPAL_MCA_CMD_LINE_ID); @@ -828,8 +826,9 @@ static int remote_spawn(opal_buffer_t *launch) } /* setup the launch */ - if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, 
orte_process_info.nodename, &node_name_index1, - &proc_vpid_index, prefix, NULL))) { + if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, + orte_process_info.nodename, &node_name_index1, + &proc_vpid_index, prefix))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&coll); goto cleanup; @@ -1030,7 +1029,6 @@ static void launch_daemons(int fd, short args, void *cbdata) int port, *portptr; orte_namelist_t *child; char *rtmod; - char *nlistflat; /* if we are launching debugger daemons, then just go * do it - no new daemons will be launched @@ -1199,33 +1197,12 @@ static void launch_daemons(int fd, short args, void *cbdata) orte_routed.get_routing_list(rtmod, &coll); } - /* create a list of all nodes involved so we can pass it along */ - char **nodelist = NULL; - orte_node_t *n2; - for (nnode=0; nnode < map->nodes->size; nnode++) { - if (NULL != (n2 = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) { - opal_argv_append_nosize(&nodelist, n2->name); - } - } - /* we need mpirun to be the first node on this list */ - if (NULL == nodelist || 0 != strcmp(nodelist[0], orte_process_info.nodename)) { - opal_argv_prepend_nosize(&nodelist, orte_process_info.nodename); - } - nlistflat = opal_argv_join(nodelist, ','); - opal_argv_free(nodelist); - /* setup the launch */ if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1, - &proc_vpid_index, prefix_dir, nlistflat))) { + &proc_vpid_index, prefix_dir))) { ORTE_ERROR_LOG(rc); - if (NULL != nlistflat) { - free(nlistflat); - } goto cleanup; } - if (NULL != nlistflat) { - free(nlistflat); - } /* * Iterate through each of the nodes diff --git a/orte/mca/plm/slurm/help-plm-slurm.txt b/orte/mca/plm/slurm/help-plm-slurm.txt index 8c450c0a283..837c3e88a89 100644 --- a/orte/mca/plm/slurm/help-plm-slurm.txt +++ b/orte/mca/plm/slurm/help-plm-slurm.txt @@ -10,7 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. 
-# Copyright (c) 2014 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -49,3 +49,18 @@ are running. Please consult with your system administrator about obtaining such support. +[no-local-support] +The SLURM process starter cannot start processes local to +mpirun when executing under a Cray environment. The problem +is that mpirun is not itself a child of a slurmd daemon. Thus, +any processes mpirun itself starts will inherit incorrect +RDMA credentials. + +Your application will be mapped and run (assuming adequate +resources) on the remaining allocated nodes. If adequate +resources are not available, you will need to exit and obtain +a larger allocation. + +This situation will be fixed in a future release. Meantime, +you can turn "off" this warning by setting the plm_slurm_warning +MCA param to 0. diff --git a/orte/mca/plm/slurm/plm_slurm.h b/orte/mca/plm/slurm/plm_slurm.h index eae239edf07..1e88ef60a84 100644 --- a/orte/mca/plm/slurm/plm_slurm.h +++ b/orte/mca/plm/slurm/plm_slurm.h @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -29,6 +30,7 @@ BEGIN_C_DECLS struct orte_plm_slurm_component_t { orte_plm_base_component_t super; char *custom_args; + bool slurm_warning_msg; }; typedef struct orte_plm_slurm_component_t orte_plm_slurm_component_t; diff --git a/orte/mca/plm/slurm/plm_slurm_component.c b/orte/mca/plm/slurm/plm_slurm_component.c index 90d14dd24c7..3e29bd46231 100644 --- a/orte/mca/plm/slurm/plm_slurm_component.c +++ b/orte/mca/plm/slurm/plm_slurm_component.c @@ -12,6 +12,7 @@ * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -28,7 +29,9 @@ #include "orte_config.h" #include "orte/constants.h" +#include "opal/util/opal_environ.h" #include "orte/util/name_fns.h" +#include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/plm/plm.h" @@ -99,6 +102,13 @@ static int plm_slurm_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_plm_slurm_component.custom_args); + mca_plm_slurm_component.slurm_warning_msg = true; + (void) mca_base_component_var_register (comp, "warning", "Turn off warning message", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_plm_slurm_component.slurm_warning_msg); + return ORTE_SUCCESS; } diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index 75bebac4726..fc62b057f3b 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -65,7 +65,7 @@ #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_quit.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rmaps/rmaps.h" +#include "orte/mca/rmaps/base/base.h" #include "orte/mca/state/state.h" #include "orte/orted/orted.h" @@ -193,6 +193,25 @@ static void launch_daemons(int fd, short args, void *cbdata) "%s plm:slurm: LAUNCH DAEMONS CALLED", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); +#if SLURM_CRAY_ENV + /* if we are in a Cray-SLURM environment, then we cannot + * launch procs local to the HNP. The problem + * is the MPI processes launched on the head node (where the + * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon + * (mpirun) which is not a child of a slurmd daemon. This + * means that any RDMA credentials obtained via the odls/alps + * local launcher are incorrect. So warn the user and set + * the envar for no_schedule_local if mpirun is not on a + * system management node (i.e. 
is part of the allocation) + * and the "no_use_local" flag hasn't been set */ + if (mca_plm_slurm_component.slurm_warning_msg && + (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) { + orte_show_help("help-plm-slurm.txt", "no-local-support", true); + ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL); + mca_plm_slurm_component.slurm_warning_msg = false; // only do this once + } +#endif + /* if we are launching debugger daemons, then just go * do it - no new daemons will be launched */ @@ -323,6 +342,7 @@ static void launch_daemons(int fd, short args, void *cbdata) goto cleanup; } nodelist_flat = opal_argv_join(nodelist_argv, ','); + opal_argv_free(nodelist_argv); /* if we are using all allocated nodes, then srun doesn't * require any further arguments @@ -345,6 +365,7 @@ static void launch_daemons(int fd, short args, void *cbdata) OPAL_OUTPUT_VERBOSE((2, orte_plm_base_framework.framework_output, "%s plm:slurm: launching on nodes %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat)); + free(nodelist_flat); /* * ORTED OPTIONS @@ -353,18 +374,9 @@ static void launch_daemons(int fd, short args, void *cbdata) /* add the daemon command (as specified by user) */ orte_plm_base_setup_orted_cmd(&argc, &argv); - /* we need mpirun to be the first node on this list - since we - * aren't launching mpirun via srun, it won't be there now */ - opal_argv_prepend_nosize(&nodelist_argv, orte_process_info.nodename); - free(nodelist_flat); - nodelist_flat = opal_argv_join(nodelist_argv, ','); - opal_argv_free(nodelist_argv); - /* Add basic orted command line options, including debug flags */ orte_plm_base_orted_append_basic_args(&argc, &argv, - "slurm", &proc_vpid_index, - nodelist_flat); - free(nodelist_flat); + "slurm", &proc_vpid_index); /* tell the new daemons the base of the name list so they can compute * their own name on the other end diff --git a/orte/mca/plm/tm/plm_tm_module.c 
b/orte/mca/plm/tm/plm_tm_module.c index e3e0c422da8..915d78aa0ea 100644 --- a/orte/mca/plm/tm/plm_tm_module.c +++ b/orte/mca/plm/tm/plm_tm_module.c @@ -171,7 +171,6 @@ static void launch_daemons(int fd, short args, void *cbdata) char **env = NULL; char *var; char **argv = NULL; - char **nodeargv; int argc = 0; int rc; orte_std_cntr_t i; @@ -180,7 +179,6 @@ static void launch_daemons(int fd, short args, void *cbdata) tm_task_id *tm_task_ids = NULL; bool failed_launch = true; mode_t current_umask; - char *nodelist; char* vpid_string; orte_job_t *daemons, *jdata; orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; @@ -260,33 +258,9 @@ static void launch_daemons(int fd, short args, void *cbdata) /* add the daemon command (as specified by user) */ orte_plm_base_setup_orted_cmd(&argc, &argv); - /* create a list of nodes in this launch */ - nodeargv = NULL; - for (i = 0; i < map->nodes->size; i++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { - continue; - } - - /* if this daemon already exists, don't launch it! 
*/ - if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) { - continue; - } - - /* add to list */ - opal_argv_append_nosize(&nodeargv, node->name); - } - /* we need mpirun to be the first node on this list - since we - * aren't launching mpirun via TM, it won't be there now */ - opal_argv_prepend_nosize(&nodeargv, orte_process_info.nodename); - nodelist = opal_argv_join(nodeargv, ','); - opal_argv_free(nodeargv); - - /* Add basic orted command line options */ orte_plm_base_orted_append_basic_args(&argc, &argv, "tm", - &proc_vpid_index, - nodelist); - free(nodelist); + &proc_vpid_index); if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) { param = opal_argv_join(argv, ' '); diff --git a/orte/mca/ras/alps/ras_alps_module.c b/orte/mca/ras/alps/ras_alps_module.c index 681c80fc9fc..a8273dfd3ca 100644 --- a/orte/mca/ras/alps/ras_alps_module.c +++ b/orte/mca/ras/alps/ras_alps_module.c @@ -365,25 +365,6 @@ ras_alps_getline(FILE *fp) return NULL; } -static int compare_nodes (opal_list_item_t **a, opal_list_item_t **b) -{ - orte_node_t *nodea = (orte_node_t *) *a; - orte_node_t *nodeb = (orte_node_t *) *b; - int32_t launcha, launchb, *ldptr; - - ldptr = &launcha; - if (!orte_get_attribute(&nodea->attributes, ORTE_NODE_LAUNCH_ID, (void**)&ldptr, OPAL_INT32)) { - return 0; - } - - ldptr = &launchb; - if (!orte_get_attribute(&nodeb->attributes, ORTE_NODE_LAUNCH_ID, (void**)&ldptr, OPAL_INT32)) { - return 0; - } - - return (launcha > launchb) ? 
1 : -1; -} - #if ALPS_APPINFO_VERSION > 0 && ALPS_APPINFO_VERSION < 3 typedef placeNodeList_t orte_ras_alps_placeNodeList_t; #else @@ -602,8 +583,6 @@ orte_ras_alps_read_appinfo_file(opal_list_t *nodes, char *filename, break; /* Extended details ignored */ } - opal_list_sort (nodes, compare_nodes); - free(cpBuf); /* Free the buffer */ return ORTE_SUCCESS; @@ -617,4 +596,3 @@ orte_ras_alps_finalize(void) "ras:alps:finalize: success (nothing to do)"); return ORTE_SUCCESS; } - diff --git a/orte/mca/ras/loadleveler/Makefile.am b/orte/mca/ras/loadleveler/Makefile.am deleted file mode 100644 index fb7c1e32fac..00000000000 --- a/orte/mca/ras/loadleveler/Makefile.am +++ /dev/null @@ -1,53 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -AM_CPPFLAGS = $(ras_loadleveler_CPPFLAGS) - -sources = \ - ras_loadleveler.h \ - ras_loadleveler_component.c \ - ras_loadleveler_module.c - - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). 
- -if MCA_BUILD_orte_ras_loadleveler_DSO -lib = -lib_sources = -component = mca_ras_loadleveler.la -component_sources = $(sources) -else -lib = libmca_ras_loadleveler.la -lib_sources = $(sources) -component = -component_sources = -endif - -mcacomponentdir = $(ortelibdir) -mcacomponent_LTLIBRARIES = $(component) -mca_ras_loadleveler_la_SOURCES = $(component_sources) -mca_ras_loadleveler_la_LDFLAGS = -module -avoid-version $(ras_loadleveler_LDFLAGS) -mca_ras_loadleveler_la_LIBADD = $(ras_loadleveler_LIBS) - -noinst_LTLIBRARIES = $(lib) -libmca_ras_loadleveler_la_SOURCES = $(lib_sources) -libmca_ras_loadleveler_la_LDFLAGS = -module -avoid-version $(ras_loadleveler_LDFLAGS) -libmca_ras_loadleveler_la_LIBADD = $(ras_loadleveler_LIBS) diff --git a/orte/mca/ras/loadleveler/configure.m4 b/orte/mca/ras/loadleveler/configure.m4 deleted file mode 100644 index 5106ec76e5b..00000000000 --- a/orte/mca/ras/loadleveler/configure.m4 +++ /dev/null @@ -1,40 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2011-2013 Los Alamos National Security, LLC. -# All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_ras_loadleveler_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_orte_ras_loadleveler_CONFIG],[ - AC_CONFIG_FILES([orte/mca/ras/loadleveler/Makefile]) - - ORTE_CHECK_LOADLEVELER([ras_loadleveler], [ras_loadleveler_good=1], [ras_loadleveler_good=0]) - - # if check worked, set wrapper flags if so. - # Evaluate succeed / fail - AS_IF([test "$ras_loadleveler_good" = "1"], - [$1], - [$2]) - - # set build flags to use in makefile - AC_SUBST([ras_loadleveler_CPPFLAGS]) - AC_SUBST([ras_loadleveler_LDFLAGS]) - AC_SUBST([ras_loadleveler_LIBS]) -])dnl diff --git a/orte/mca/ras/loadleveler/owner.txt b/orte/mca/ras/loadleveler/owner.txt deleted file mode 100644 index af4ebbf6a60..00000000000 --- a/orte/mca/ras/loadleveler/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: IBM -status: maintenance diff --git a/orte/mca/ras/loadleveler/ras_loadleveler.h b/orte/mca/ras/loadleveler/ras_loadleveler.h deleted file mode 100644 index 7e4410d167b..00000000000 --- a/orte/mca/ras/loadleveler/ras_loadleveler.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * - * Resource Allocation (Loadleveler) - */ -#ifndef ORTE_RAS_LOADLEVELER_H -#define ORTE_RAS_LOADLEVELER_H - -#include "orte_config.h" -#include "orte/mca/ras/ras.h" -#include "orte/mca/ras/base/base.h" - -BEGIN_C_DECLS - - ORTE_DECLSPEC extern orte_ras_base_component_t mca_ras_loadleveler_component; - ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_loadleveler_module; - -END_C_DECLS - -#endif diff --git a/orte/mca/ras/loadleveler/ras_loadleveler_component.c b/orte/mca/ras/loadleveler/ras_loadleveler_component.c deleted file mode 100644 index e7aff9df9a3..00000000000 --- a/orte/mca/ras/loadleveler/ras_loadleveler_component.c +++ /dev/null @@ -1,105 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include "opal/mca/base/base.h" -#include "orte/runtime/orte_globals.h" -#include "orte/util/name_fns.h" - -#include "orte/mca/ras/base/ras_private.h" -#include "ras_loadleveler.h" - - -/* - * Local variables - */ -static int param_priority; - - -/* - * Local functions - */ -static int orte_ras_loadleveler_register(void); -static int orte_ras_loadleveler_open(void); -static int orte_ras_loadleveler_component_query(mca_base_module_t **module, int *priority); - -orte_ras_base_component_t mca_ras_loadleveler_component = { - /* First, the mca_base_component_t struct containing meta - information about the component itself */ - .base_version = { - ORTE_RAS_BASE_VERSION_2_0_0, - - /* Component name and version */ - .mca_component_name = "loadleveler", - MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION), - - /* Component open and close functions */ - .mca_open_component = orte_ras_loadleveler_open, - .mca_query_component = orte_ras_loadleveler_component_query, - .mca_register_component_params = orte_ras_loadleveler_register, - }, - .base_data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, -}; - -static int orte_ras_loadleveler_register(void) -{ - /* for now we set the priority lower then the priority of the POE RAS - * so that it is used whenever the LOADL_PROCESSOR_LIST is actually set */ - param_priority = 90; - (void) mca_base_component_var_register(&mca_ras_loadleveler_component.base_version, - "priority", "Priority of the loadleveler ras component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &param_priority); - - return ORTE_SUCCESS; -} - -static int orte_ras_loadleveler_open(void) -{ - return ORTE_SUCCESS; -} - -static int orte_ras_loadleveler_component_query(mca_base_module_t **module, int *priority) -{ - /* Are we running under a 
LOADLEVELER job? */ - if (NULL != getenv("LOADL_STEP_ID")) { - *priority = param_priority; - OPAL_OUTPUT_VERBOSE((2, orte_ras_base_framework.framework_output, - "%s ras:loadleveler: available for selection", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - *module = (mca_base_module_t *) &orte_ras_loadleveler_module; - return ORTE_SUCCESS; - } - - /* Sadly, no */ - OPAL_OUTPUT_VERBOSE((2, orte_ras_base_framework.framework_output, - "%s ras:loadleveler: NOT available for selection", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - *module = NULL; - return ORTE_ERROR; -} - diff --git a/orte/mca/ras/loadleveler/ras_loadleveler_module.c b/orte/mca/ras/loadleveler/ras_loadleveler_module.c deleted file mode 100644 index 558a2a133f0..00000000000 --- a/orte/mca/ras/loadleveler/ras_loadleveler_module.c +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2016 IBM Corporation. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include -#include -#include - -#include "opal/util/argv.h" -#include "opal/util/output.h" -#include "opal/util/net.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/runtime/orte_globals.h" -#include "orte/constants.h" - -#include "orte/mca/ras/base/ras_private.h" -#include "ras_loadleveler.h" - - -/* - * Local functions - */ -static int orte_ras_loadleveler_allocate(orte_job_t *jdata, opal_list_t *nodes); -static int orte_ras_loadleveler_finalize(void); - -static int orte_ras_loadleveler_discover(opal_list_t *nodelist); -static int ll_getline(FILE *fp, char *input); - -#define LL_FILE_MAX_LINE_LENGTH 512 - -/* - * Global variable - */ -orte_ras_base_module_t orte_ras_loadleveler_module = { - NULL, - orte_ras_loadleveler_allocate, - NULL, - orte_ras_loadleveler_finalize -}; - - -/* - * Discover available (pre-allocated) nodes. Allocate the - * requested number of nodes/process slots to the job. - */ -static int orte_ras_loadleveler_allocate(orte_job_t *jdata, opal_list_t *nodes) -{ - int ret = ORTE_SUCCESS; - - if (ORTE_SUCCESS != (ret = orte_ras_loadleveler_discover(nodes))) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* If we didn't find anything, then this - * is an unrecoverable error - report it - */ - if (opal_list_is_empty(nodes)) { - opal_output(orte_ras_base_framework.framework_output, - "ras:loadleveler:allocate: No nodes were found in the LOADL_HOSTFILE - %s", - getenv("LOADL_HOSTFILE")); - return ORTE_ERR_NOT_FOUND; - } - - return ret; -} - -/* - * There's really nothing to do here - */ -static int orte_ras_loadleveler_finalize(void) -{ - OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output, - "ras:loadleveler:finalize: success (nothing to do)")); - return ORTE_SUCCESS; -} - -/** - * Discover the available resources. 
Obtain directly from LoadLeveler (and - * therefore have no need to validate) -- ignore hostfile or any other - * user-specified parameters. - */ -static int orte_ras_loadleveler_discover(opal_list_t* nodelist) -{ - orte_node_t *node; - opal_list_item_t* item; - FILE *fp; - char *hostname; - char *filename; - char input[LL_FILE_MAX_LINE_LENGTH]; - char *ptr; - - /* Ignore anything that the user already specified -- we're - getting nodes only from LoadLeveler. */ - filename = getenv("LOADL_HOSTFILE"); - if(NULL == filename) { - opal_output(orte_ras_base_framework.framework_output, - "ras:loadleveler:allocate:discover: LOADL_HOSTFILE not set. " - "Unable to discover allocated nodes."); - return ORTE_ERROR; - } - fp = fopen(filename, "r"); - if (NULL == fp) { - ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE); - return ORTE_ERR_FILE_OPEN_FAILURE; - } - - /* Iterate through all the nodes and make an entry for each */ - while (0 != ll_getline(fp, input)) { - hostname = strdup(input); - if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(hostname) ) { - if (NULL != (ptr = strchr(hostname, '.'))) { - *ptr = '\0'; - } - } - - OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output, - "%s ras:loadleveler:allocate:discover: got hostname %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname)); - - /* Remember that LoadLeveler may list the same node more than once. - So we have to check for duplicates. */ - for (item = opal_list_get_first(nodelist); - opal_list_get_end(nodelist) != item; - item = opal_list_get_next(item)) { - node = (orte_node_t*) item; - if (0 == strcmp(node->name, hostname)) { - ++node->slots; - - OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output, - "%s ras:loadleveler:allocate:discover: found -- bumped slots to %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->slots)); - break; - } - } - - /* Did we find it? 
*/ - if (opal_list_get_end(nodelist) == item) { - /* Nope -- didn't find it, so add a new item to the list */ - OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output, - "%s ras:loadleveler:allocate:discover: not found -- added to list", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - node = OBJ_NEW(orte_node_t); - node->name = hostname; - node->state = ORTE_NODE_STATE_UP; - node->slots_inuse = 0; - node->slots_max = 0; - node->slots = 1; - opal_list_append(nodelist, &node->super); - } else { - /* Yes, so we need to free the hostname that came back */ - free(hostname); - } - } - fclose(fp); - - return ORTE_SUCCESS; -} - -static int ll_getline(FILE *fp, char *input) -{ - char *ret; - - ret = fgets(input, LL_FILE_MAX_LINE_LENGTH, fp); - if (NULL != ret) { - input[strlen(input)-1] = '\0'; /* remove newline */ - return 1; - } - - return 0; -} diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index abf8e8a956a..4bc44bf3b0e 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -477,55 +477,60 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr (int)opal_list_get_size(allocated_nodes))); complete: + num_slots = 0; /* remove all nodes that are already at max usage, and * compute the total number of allocated slots while - * we do so */ - num_slots = 0; - item = opal_list_get_first(allocated_nodes); - while (item != opal_list_get_end(allocated_nodes)) { - /** save the next pointer in case we remove this node */ - next = opal_list_get_next(item); - /** check to see if this node is fully used - remove if so */ - node = (orte_node_t*)item; - if (0 != node->slots_max && node->slots_inuse > node->slots_max) { - OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, - "%s Removing node %s: max %d inuse %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name, node->slots_max, node->slots_inuse)); - 
opal_list_remove_item(allocated_nodes, item); - OBJ_RELEASE(item); /* "un-retain" it */ - } else if (node->slots <= node->slots_inuse && - (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { - /* remove the node as fully used */ - OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, - "%s Removing node %s slots %d inuse %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name, node->slots, node->slots_inuse)); - opal_list_remove_item(allocated_nodes, item); - OBJ_RELEASE(item); /* "un-retain" it */ - } else if (node->slots > node->slots_inuse) { - /* add the available slots */ + * we do so - can ignore this if we are mapping debugger + * daemons as they do not count against the allocation */ + if (ORTE_MAPPING_DEBUGGER & ORTE_GET_MAPPING_DIRECTIVE(policy)) { + num_slots = opal_list_get_size(allocated_nodes); // tell the mapper there is one slot/node for debuggers + } else { + item = opal_list_get_first(allocated_nodes); + while (item != opal_list_get_end(allocated_nodes)) { + /** save the next pointer in case we remove this node */ + next = opal_list_get_next(item); + /** check to see if this node is fully used - remove if so */ + node = (orte_node_t*)item; + if (0 != node->slots_max && node->slots_inuse > node->slots_max) { OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, - "%s node %s has %d slots available", + "%s Removing node %s: max %d inuse %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name, node->slots - node->slots_inuse)); - num_slots += node->slots - node->slots_inuse; - } else if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { - /* nothing needed to do here - we don't add slots to the - * count as we don't have any available. 
Just let the mapper - * do what it needs to do to meet the request - */ + node->name, node->slots_max, node->slots_inuse)); + opal_list_remove_item(allocated_nodes, item); + OBJ_RELEASE(item); /* "un-retain" it */ + } else if (node->slots <= node->slots_inuse && + (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { + /* remove the node as fully used */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, - "%s node %s is fully used, but available for oversubscrition", + "%s Removing node %s slots %d inuse %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name)); - } else { - /* if we cannot use it, remove it from list */ - opal_list_remove_item(allocated_nodes, item); - OBJ_RELEASE(item); /* "un-retain" it */ + node->name, node->slots, node->slots_inuse)); + opal_list_remove_item(allocated_nodes, item); + OBJ_RELEASE(item); /* "un-retain" it */ + } else if (node->slots > node->slots_inuse) { + /* add the available slots */ + OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, + "%s node %s has %d slots available", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name, node->slots - node->slots_inuse)); + num_slots += node->slots - node->slots_inuse; + } else if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { + /* nothing needed to do here - we don't add slots to the + * count as we don't have any available. 
Just let the mapper + * do what it needs to do to meet the request + */ + OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, + "%s node %s is fully used, but available for oversubscription", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name)); + } else { + /* if we cannot use it, remove it from list */ + opal_list_remove_item(allocated_nodes, item); + OBJ_RELEASE(item); /* "un-retain" it */ + } + /** go on to next item */ + item = next; } - /** go on to next item */ - item = next; } /* Sanity check to make sure we have resources available */ diff --git a/orte/mca/rmaps/ppr/rmaps_ppr.c b/orte/mca/rmaps/ppr/rmaps_ppr.c index 7af292d308e..35285e95cda 100644 --- a/orte/mca/rmaps/ppr/rmaps_ppr.c +++ b/orte/mca/rmaps/ppr/rmaps_ppr.c @@ -327,47 +327,49 @@ static int ppr_mapper(orte_job_t *jdata) } } - /* set the total slots used */ - if ((int)node->num_procs <= node->slots) { - node->slots_inuse = (int)node->num_procs; - } else { - node->slots_inuse = node->slots; - } - - /* if no-oversubscribe was specified, check to see if - * we have violated the total slot specification - regardless, - * if slots_max was given, we are not allowed to violate it! 
- */ - if ((node->slots < (int)node->num_procs) || - (0 < node->slots_max && node->slots_max < (int)node->num_procs)) { - if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", - true, node->num_procs, app->app); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - rc = ORTE_ERR_SILENT; - goto error; + if (!(ORTE_MAPPING_DEBUGGER & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) { + /* set the total slots used */ + if ((int)node->num_procs <= node->slots) { + node->slots_inuse = (int)node->num_procs; + } else { + node->slots_inuse = node->slots; } - /* flag the node as oversubscribed so that sched-yield gets - * properly set + + /* if no-oversubscribe was specified, check to see if + * we have violated the total slot specification - regardless, + * if slots_max was given, we are not allowed to violate it! */ - ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED); - ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED); - /* check for permission */ - if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) { - /* if we weren't given a directive either way, then we will error out - * as the #slots were specifically given, either by the host RM or - * via hostfile/dash-host */ - if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { + if ((node->slots < (int)node->num_procs) || + (0 < node->slots_max && node->slots_max < (int)node->num_procs)) { + if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", - true, app->num_procs, app->app); + true, node->num_procs, app->app); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - return ORTE_ERR_SILENT; - } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - /* if we were explicitly told not to oversubscribe, then don't */ - 
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", - true, app->num_procs, app->app); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - return ORTE_ERR_SILENT; + rc = ORTE_ERR_SILENT; + goto error; + } + /* flag the node as oversubscribed so that sched-yield gets + * properly set + */ + ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED); + ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED); + /* check for permission */ + if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) { + /* if we weren't given a directive either way, then we will error out + * as the #slots were specifically given, either by the host RM or + * via hostfile/dash-host */ + if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { + orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", + true, app->num_procs, app->app); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + return ORTE_ERR_SILENT; + } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { + /* if we were explicitly told not to oversubscribe, then don't */ + orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", + true, app->num_procs, app->app); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + return ORTE_ERR_SILENT; + } } } } diff --git a/orte/mca/rmaps/rmaps_types.h b/orte/mca/rmaps/rmaps_types.h index 127a9d445e6..74f82b6f14c 100644 --- a/orte/mca/rmaps/rmaps_types.h +++ b/orte/mca/rmaps/rmaps_types.h @@ -12,7 +12,7 @@ * Copyright (c) 2011-2017 Cisco Systems, Inc. All rights reserved * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -91,6 +91,8 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_map_t); /* an error flag */ #define ORTE_MAPPING_CONFLICTED 0x2000 #define ORTE_MAPPING_GIVEN 0x4000 +/* mapping a debugger job */ +#define ORTE_MAPPING_DEBUGGER 0x8000 #define ORTE_SET_MAPPING_DIRECTIVE(target, pol) \ (target) |= (pol) #define ORTE_UNSET_MAPPING_DIRECTIVE(target, pol) \ diff --git a/orte/mca/rml/base/base.h b/orte/mca/rml/base/base.h index ead6f1d10d4..f8cc4b1c0b9 100644 --- a/orte/mca/rml/base/base.h +++ b/orte/mca/rml/base/base.h @@ -55,8 +55,6 @@ BEGIN_C_DECLS -OPAL_TIMING_DECLARE_EXT(ORTE_DECLSPEC, tm_rml) - /* * MCA Framework */ diff --git a/orte/mca/rml/base/rml_base_frame.c b/orte/mca/rml/base/rml_base_frame.c index 61e4f4cd9a4..803bf2db975 100644 --- a/orte/mca/rml/base/rml_base_frame.c +++ b/orte/mca/rml/base/rml_base_frame.c @@ -56,7 +56,6 @@ orte_rml_base_API_t orte_rml = { }; orte_rml_base_t orte_rml_base = {{{0}}}; -OPAL_TIMING_DECLARE(tm_rml) orte_rml_component_t *orte_rml_component = NULL; @@ -136,8 +135,6 @@ static int orte_rml_base_close(void) cleanup(0, 0, NULL); } - OPAL_TIMING_REPORT(orte_rml_base.timing, &tm_rml); - return mca_base_framework_components_close(&orte_rml_base_framework, NULL); } @@ -151,7 +148,6 @@ static int orte_rml_base_open(mca_base_open_flag_t flags) OBJ_CONSTRUCT(&orte_rml_base.conduits, opal_pointer_array_t); opal_pointer_array_init(&orte_rml_base.conduits,1,INT_MAX,1); - OPAL_TIMING_INIT(&tm_rml); /* Open up all available components */ return mca_base_framework_components_open(&orte_rml_base_framework, flags); } diff --git a/orte/mca/rml/base/rml_base_msg_handlers.c b/orte/mca/rml/base/rml_base_msg_handlers.c index 6652f9ad527..0772a5d3a6e 100644 --- a/orte/mca/rml/base/rml_base_msg_handlers.c +++ b/orte/mca/rml/base/rml_base_msg_handlers.c @@ -165,9 +165,6 @@ void orte_rml_base_process_msg(int fd, short flags, void *cbdata) ORTE_NAME_PRINT(&msg->sender), msg->tag)); - 
OPAL_TIMING_EVENT((&tm_rml,"from %s %d bytes", - ORTE_NAME_PRINT(&msg->sender), msg->iov.iov_len)); - /* if this message is just to warmup the connection, then drop it */ if (ORTE_RML_TAG_WARMUP_CONNECTION == msg->tag) { OBJ_RELEASE(msg); diff --git a/orte/mca/rml/ofi/.opal_ignore b/orte/mca/rml/ofi/.opal_ignore deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/orte/mca/rml/ofi/.opal_unignore b/orte/mca/rml/ofi/.opal_unignore deleted file mode 100644 index 335cd142ab7..00000000000 --- a/orte/mca/rml/ofi/.opal_unignore +++ /dev/null @@ -1,2 +0,0 @@ -anandhis -rhc diff --git a/orte/mca/rml/ofi/Makefile.am b/orte/mca/rml/ofi/Makefile.am deleted file mode 100644 index 0864c15c170..00000000000 --- a/orte/mca/rml/ofi/Makefile.am +++ /dev/null @@ -1,49 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2015-2017 Intel, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -AM_CPPFLAGS = $(opal_common_libfabric_CPPFLAGS) - -sources = \ - rml_ofi.h \ - rml_ofi_request.h \ - rml_ofi_component.c \ - rml_ofi_send.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). 
- -if MCA_BUILD_orte_rml_ofi_DSO -component_noinst = -component_install = mca_rml_ofi.la -else -component_noinst = libmca_rml_ofi.la -component_install = -endif - -mcacomponentdir = $(ortelibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_rml_ofi_la_SOURCES = $(sources) -mca_rml_ofi_la_LDFLAGS = -module -avoid-version -mca_rml_ofi_la_LIBADD = $(OPAL_TOP_BUILDDIR)/opal/mca/common/libfabric/lib@OPAL_LIB_PREFIX@mca_common_libfabric.la - -noinst_LTLIBRARIES = $(component_noinst) -libmca_rml_ofi_la_SOURCES = $(sources) -libmca_rml_ofi_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/rml/ofi/configure.m4 b/orte/mca/rml/ofi/configure.m4 deleted file mode 100644 index e0e930b4080..00000000000 --- a/orte/mca/rml/ofi/configure.m4 +++ /dev/null @@ -1,29 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2013-2014 Intel, Inc. All rights reserved -# -# Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_orte_rml_ofi_POST_CONFIG(will_build) -# ---------------------------------------- -# Only require the tag if we're actually going to be built - -# MCA_mtl_ofi_CONFIG([action-if-can-compile], -# [action-if-cant-compile]) -# ------------------------------------------------ -AC_DEFUN([MCA_orte_rml_ofi_CONFIG],[ - AC_CONFIG_FILES([orte/mca/rml/ofi/Makefile]) - - # ensure we already ran the common libfabric config - AC_REQUIRE([MCA_opal_common_libfabric_CONFIG]) - - AS_IF([test "$opal_common_libfabric_happy" = "yes"], - [$1], - [$2]) -])dnl diff --git a/orte/mca/rml/ofi/rml_ofi.h b/orte/mca/rml/ofi/rml_ofi.h deleted file mode 100644 index 32332e4f2bd..00000000000 --- a/orte/mca/rml/ofi/rml_ofi.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2015 Intel, Inc. 
All rights reserved - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_RML_OFI_RML_OFI_H -#define MCA_RML_OFI_RML_OFI_H - -#include "orte_config.h" - -#include "opal/dss/dss_types.h" -#include "opal/mca/event/event.h" -#include "opal/mca/pmix/pmix.h" -#include "orte/mca/rml/base/base.h" - -#include -#include -#include -#include -#include -#include - -#include "rml_ofi_request.h" - -/** the maximum open OFI ofi_prov - assuming system will have no more than 20 transports*/ -#define MAX_OFI_PROVIDERS 40 -#define RML_OFI_PROV_ID_INVALID 0xFF - -/** RML/OFI key values **/ -/* (char*) ofi socket address (type IN) of the node process is running on */ -#define OPAL_RML_OFI_FI_SOCKADDR_IN "rml.ofi.fisockaddrin" -/* (char*) ofi socket address (type PSM) of the node process is running on */ -#define OPAL_RML_OFI_FI_ADDR_PSMX "rml.ofi.fiaddrpsmx" - -// MULTI_BUF_SIZE_FACTOR defines how large the multi recv buffer will be. -// In order to use FI_MULTI_RECV feature efficiently, we need to have a -// large recv buffer so that we don't need to repost the buffer often to -// get the remaining data when the buffer is full -#define MULTI_BUF_SIZE_FACTOR 128 -#define MIN_MULTI_BUF_SIZE (1024 * 1024) - -#define OFIADDR "ofiaddr" - -#define CLOSE_FID(fd) \ - do { \ - int _ret = 0; \ - if (0 != (fd)) { \ - _ret = fi_close(&(fd)->fid); \ - fd = NULL; \ - if (0 != _ret) { \ - opal_output_verbose(10,orte_rml_base_framework.framework_output, \ - " %s - fi_close failed with error- %d", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ret); \ - } \ - } \ - } while (0); - - -#define RML_OFI_RETRY_UNTIL_DONE(FUNC) \ - do { \ - do { \ - ret = FUNC; \ - if(OPAL_LIKELY(0 == ret)) {break;} \ - } while(-FI_EAGAIN == ret); \ - } while(0); - -BEGIN_C_DECLS - -struct orte_rml_ofi_module_t; - -/** This structure will hold the ep and all ofi objects for each transport -and also the corresponding fi_info -**/ -typedef struct { - - /** ofi provider ID **/ - uint8_t 
ofi_prov_id; - - /** fi_info for this transport */ - struct fi_info *fabric_info; - - /** Fabric Domain handle */ - struct fid_fabric *fabric; - - /** Access Domain handle */ - struct fid_domain *domain; - - /** Address vector handle */ - struct fid_av *av; - - /** Completion queue handle */ - struct fid_cq *cq; - - /** Endpoint to communicate on */ - struct fid_ep *ep; - - /** Endpoint name */ - char ep_name[FI_NAME_MAX]; - - /** Endpoint name length */ - size_t epnamelen; - - /** OFI memory region */ - struct fid_mr *mr_multi_recv; - - /** buffer for tx and rx */ - void *rxbuf; - - uint64_t rxbuf_size; - - /* event,fd associated with the cq */ - int fd; - - /*event associated with progress fn */ - opal_event_t progress_event; - bool progress_ev_active; - - struct fi_context rx_ctx1; - -} ofi_transport_ofi_prov_t; - - - struct orte_rml_ofi_module_t { - orte_rml_base_module_t api; - - /** current ofi transport id the component is using, this will be initialised - ** in the open_ofi_prov() call **/ - int cur_transport_id; - - /** Fabric info structure of all supported transports in system **/ - struct fi_info *fi_info_list; - - /** OFI ep and corr fi_info for all the transports (ofi_providers) **/ - ofi_transport_ofi_prov_t ofi_prov[MAX_OFI_PROVIDERS]; - - size_t min_ofi_recv_buf_sz; - - /** "Any source" address */ - fi_addr_t any_addr; - - /** number of ofi providers currently opened **/ - uint8_t ofi_prov_open_num; - - /** Unique message id for every message that is fragmented to be sent over OFI **/ - uint32_t cur_msgid; - - /* hashtable stores the peer addresses */ - opal_hash_table_t peers; - - opal_list_t recv_msg_queue_list; - opal_list_t queued_routing_messages; - opal_event_t *timer_event; - struct timeval timeout; -} ; -typedef struct orte_rml_ofi_module_t orte_rml_ofi_module_t; - -typedef struct { - opal_object_t super; - void* ofi_ep; - size_t ofi_ep_len; -} orte_rml_ofi_peer_t; -OBJ_CLASS_DECLARATION(orte_rml_ofi_peer_t); - -ORTE_MODULE_DECLSPEC extern 
orte_rml_component_t mca_rml_ofi_component; -extern orte_rml_ofi_module_t orte_rml_ofi; - -int orte_rml_ofi_send_buffer_nb(struct orte_rml_base_module_t *mod, - orte_process_name_t* peer, - struct opal_buffer_t* buffer, - orte_rml_tag_t tag, - orte_rml_buffer_callback_fn_t cbfunc, - void* cbdata); -int orte_rml_ofi_send_nb(struct orte_rml_base_module_t *mod, - orte_process_name_t* peer, - struct iovec* iov, - int count, - orte_rml_tag_t tag, - orte_rml_callback_fn_t cbfunc, - void* cbdata); - -/****************** INTERNAL OFI Functions*************/ -void free_ofi_prov_resources( int ofi_prov_id); -void print_provider_list_info (struct fi_info *fi ); -void print_provider_info (struct fi_info *cur_fi ); -int cq_progress_handler(int sd, short flags, void *cbdata); -int get_ofi_prov_id( opal_list_t *attributes); - -/** Send callback */ -int orte_rml_ofi_send_callback(struct fi_cq_data_entry *wc, - orte_rml_ofi_request_t*); - -/** Error callback */ -int orte_rml_ofi_error_callback(struct fi_cq_err_entry *error, - orte_rml_ofi_request_t*); - -/* OFI Recv handler */ -int orte_rml_ofi_recv_handler(struct fi_cq_data_entry *wc, uint8_t ofi_prov_id); - -END_C_DECLS - -#endif diff --git a/orte/mca/rml/ofi/rml_ofi_component.c b/orte/mca/rml/ofi/rml_ofi_component.c deleted file mode 100644 index ab2dc1c4250..00000000000 --- a/orte/mca/rml/ofi/rml_ofi_component.c +++ /dev/null @@ -1,1248 +0,0 @@ -/* - * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/base/base.h" -#include "opal/util/argv.h" -#include "opal/util/output.h" -#include "opal/mca/backtrace/backtrace.h" -#include "opal/mca/event/event.h" - -#if OPAL_ENABLE_FT_CR == 1 -#include "orte/mca/rml/rml.h" -#include "orte/mca/state/state.h" -#endif -#include "orte/mca/rml/base/base.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/util/name_fns.h" -#include "orte/runtime/orte_globals.h" - -#include "rml_ofi.h" - - -static int rml_ofi_component_open(void); -static int rml_ofi_component_close(void); -static int rml_ofi_component_init(void); -static orte_rml_base_module_t* open_conduit(opal_list_t *attributes); -static orte_rml_pathway_t* query_transports(void); -static char* ofi_get_contact_info(void); -static void process_uri(char *uri); -static void ofi_set_contact_info (const char *uri); -void convert_to_sockaddr( char *ofiuri, struct sockaddr_in* ep_sockaddr); - -/** - * component definition - */ -orte_rml_component_t mca_rml_ofi_component = { - /* First, the mca_base_component_t struct containing meta - information about the component itself */ - - .base = { - ORTE_RML_BASE_VERSION_3_0_0, - - .mca_component_name = "ofi", - MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION), - .mca_open_component = rml_ofi_component_open, - .mca_close_component = rml_ofi_component_close, - }, - .data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - .priority = 10, - .open_conduit = open_conduit, - .query_transports = query_transports, - .get_contact_info = ofi_get_contact_info, - .set_contact_info = ofi_set_contact_info, - .close_conduit = NULL -}; - -/* Local variables */ -orte_rml_ofi_module_t orte_rml_ofi = { - .api = { - .component = (struct 
orte_rml_component_t*)&mca_rml_ofi_component, - .ping = NULL, - .send_nb = orte_rml_ofi_send_nb, - .send_buffer_nb = orte_rml_ofi_send_buffer_nb, - .purge = NULL - } -}; - -/* Local variables */ -static bool init_done = false; - -static int -rml_ofi_component_open(void) -{ - /* Initialise endpoint and all queues */ - - orte_rml_ofi.fi_info_list = NULL; - orte_rml_ofi.min_ofi_recv_buf_sz = MIN_MULTI_BUF_SIZE; - orte_rml_ofi.cur_msgid = 1; - orte_rml_ofi.cur_transport_id = RML_OFI_PROV_ID_INVALID; - orte_rml_ofi.ofi_prov_open_num = 0; - OBJ_CONSTRUCT(&orte_rml_ofi.peers, opal_hash_table_t); - opal_hash_table_init(&orte_rml_ofi.peers, 128); - - for( uint8_t ofi_prov_id=0; ofi_prov_id < MAX_OFI_PROVIDERS ; ofi_prov_id++) { - orte_rml_ofi.ofi_prov[ofi_prov_id].fabric = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].domain = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].av = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].cq = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].ep = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].ep_name[0] = 0; - orte_rml_ofi.ofi_prov[ofi_prov_id].epnamelen = 0; - orte_rml_ofi.ofi_prov[ofi_prov_id].mr_multi_recv = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].rxbuf = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].rxbuf_size = 0; - orte_rml_ofi.ofi_prov[ofi_prov_id].progress_ev_active = false; - orte_rml_ofi.ofi_prov[ofi_prov_id].ofi_prov_id = RML_OFI_PROV_ID_INVALID; - } - - opal_output_verbose(10,orte_rml_base_framework.framework_output," from %s:%d rml_ofi_component_open()",__FILE__,__LINE__); - - return ORTE_SUCCESS; -} - - -void free_ofi_prov_resources( int ofi_prov_id) -{ - - int ret=0; - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - free_ofi_prov_resources() begin. 
OFI ofi_prov_id- %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ofi_prov_id); - if (orte_rml_ofi.ofi_prov[ofi_prov_id].ep) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - close ep",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - CLOSE_FID(orte_rml_ofi.ofi_prov[ofi_prov_id].ep); - if (ret) - { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - fi_close(ep) failed with error- %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ret); - } - } - if (orte_rml_ofi.ofi_prov[ofi_prov_id].mr_multi_recv) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - close mr_multi_recv",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - CLOSE_FID(orte_rml_ofi.ofi_prov[ofi_prov_id].mr_multi_recv); - } - if (orte_rml_ofi.ofi_prov[ofi_prov_id].cq) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - close cq",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - CLOSE_FID(orte_rml_ofi.ofi_prov[ofi_prov_id].cq); - } - if (orte_rml_ofi.ofi_prov[ofi_prov_id].av) { - CLOSE_FID(orte_rml_ofi.ofi_prov[ofi_prov_id].av); - } - if (orte_rml_ofi.ofi_prov[ofi_prov_id].domain) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - close domain",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - CLOSE_FID(orte_rml_ofi.ofi_prov[ofi_prov_id].domain); - } - if (orte_rml_ofi.ofi_prov[ofi_prov_id].fabric) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - close fabric",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - fi_close((fid_t)orte_rml_ofi.ofi_prov[ofi_prov_id].fabric); - } - if (orte_rml_ofi.ofi_prov[ofi_prov_id].rxbuf) { - free(orte_rml_ofi.ofi_prov[ofi_prov_id].rxbuf); - } - - orte_rml_ofi.ofi_prov[ofi_prov_id].fabric = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].domain = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].av = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].cq = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].ep = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].ep_name[0] = 0; - orte_rml_ofi.ofi_prov[ofi_prov_id].epnamelen = 
0; - orte_rml_ofi.ofi_prov[ofi_prov_id].rxbuf = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].rxbuf_size = 0; - orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].mr_multi_recv = NULL; - orte_rml_ofi.ofi_prov[ofi_prov_id].ofi_prov_id = RML_OFI_PROV_ID_INVALID; - - - if( orte_rml_ofi.ofi_prov[ofi_prov_id].progress_ev_active) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - deleting progress event", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_event_del( &orte_rml_ofi.ofi_prov[ofi_prov_id].progress_event); - } - - return; -} - - -static int -rml_ofi_component_close(void) -{ - - int rc; - opal_object_t *value; - uint64_t key; - void *node; - uint8_t ofi_prov_id; - - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - rml_ofi_component_close() -begin, total open OFI providers = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),orte_rml_ofi.ofi_prov_open_num); - - if (orte_rml_ofi.fi_info_list) { - (void) fi_freeinfo(orte_rml_ofi.fi_info_list); - } - - /* Close endpoint and all queues */ - for (ofi_prov_id=0; ofi_prov_id < orte_rml_ofi.ofi_prov_open_num; ofi_prov_id++) { - free_ofi_prov_resources(ofi_prov_id); - } - - /* release all peers from the hash table */ - rc = opal_hash_table_get_first_key_uint64(&orte_rml_ofi.peers, &key, - (void **)&value, &node); - while (OPAL_SUCCESS == rc) { - if (NULL != value) { - OBJ_RELEASE(value); - } - rc = opal_hash_table_get_next_key_uint64 (&orte_rml_ofi.peers, &key, - (void **) &value, node, &node); - } - OBJ_DESTRUCT(&orte_rml_ofi.peers); - OPAL_LIST_DESTRUCT(&orte_rml_ofi.recv_msg_queue_list); - - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - rml_ofi_component_close() end",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return ORTE_SUCCESS; -} - -void print_provider_info (struct fi_info *cur_fi ) -{ - //Display all the details in the fi_info structure - opal_output_verbose(1,orte_rml_base_framework.framework_output, - " %s - 
Print_provider_info() ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " Provider name : %s",cur_fi->fabric_attr->prov_name); - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " Protocol : %s",fi_tostr(&cur_fi->ep_attr->protocol,FI_TYPE_PROTOCOL)); - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " EP Type : %s",fi_tostr(&cur_fi->ep_attr->type,FI_TYPE_EP_TYPE)); - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " address_format : %s",fi_tostr(&cur_fi->addr_format,FI_TYPE_ADDR_FORMAT)); -} - -void print_provider_list_info (struct fi_info *fi ) -{ - struct fi_info *cur_fi = fi; - int fi_count = 0; - //Display all the details in the fi_info structure - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %s - Print_provider_list_info() ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - while( NULL != cur_fi ) { - fi_count++; - opal_output_verbose(10,orte_rml_base_framework.framework_output, - " %d.\n",fi_count); - print_provider_info( cur_fi); - cur_fi = cur_fi->next; - } - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "Total # of providers supported is %d\n",fi_count); -} - -/* - * This returns all the supported transports in the system that support endpoint type RDM (reliable datagram) - * The providers returned is a list of type opal_valut_t holding opal_list_t - */ -static orte_rml_pathway_t* query_transports(void) -{ - - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s:%d OFI Query Interface not implemented",__FILE__,__LINE__); - return NULL; -} - - -/** - ofi_prov [in]: the ofi ofi_prov_id that triggered the progress fn - **/ -__opal_attribute_always_inline__ static inline int -orte_rml_ofi_progress(ofi_transport_ofi_prov_t* prov) -{ - ssize_t ret; - int count=0; /* number of messages read and processed */ - struct fi_cq_data_entry wc = { 0 }; - struct fi_cq_err_entry error = { 0 }; - 
orte_rml_ofi_request_t *ofi_req; - - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s orte_rml_ofi_progress called for OFI ofi_provid %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id); - /** - * Read the work completions from the CQ. - * From the completion's op_context, we get the associated OFI request. - * Call the request's callback. - */ - while (true) { - /* Read the cq - that triggered the libevent to call this progress fn. */ - ret = fi_cq_read(prov->cq, (void *)&wc, 1); - if (0 < ret) { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s cq read for OFI ofi_provid %d - wc.flags = %llx", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id, (long long unsigned int)wc.flags); - count++; - // check the flags to see if this is a send-completion or receive - if ( wc.flags & FI_SEND ) - { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Send completion received on OFI provider id %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id); - if (NULL != wc.op_context) { - /* get the context from the wc and call the message handler */ - ofi_req = TO_OFI_REQ(wc.op_context); - assert(ofi_req); - ret = orte_rml_ofi_send_callback(&wc, ofi_req); - if (ORTE_SUCCESS != ret) { - opal_output(orte_rml_base_framework.framework_output, - "Error returned by OFI send callback handler when a send completion was received on OFI prov: %zd", - ret); - } - } - } else if ( (wc.flags & FI_RECV) && (wc.flags & FI_MULTI_RECV) ) { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Received message on OFI ofi_prov_id %d - but buffer is consumed, need to repost", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id); - // reposting buffer - ret = fi_recv(orte_rml_ofi.ofi_prov[prov->ofi_prov_id].ep, - orte_rml_ofi.ofi_prov[prov->ofi_prov_id].rxbuf, - orte_rml_ofi.ofi_prov[prov->ofi_prov_id].rxbuf_size, - fi_mr_desc(orte_rml_ofi.ofi_prov[prov->ofi_prov_id].mr_multi_recv), 
- 0,&(prov->rx_ctx1)); - // call the receive message handler that will call the rml_base - ret = orte_rml_ofi_recv_handler(&wc, prov->ofi_prov_id); - if (ORTE_SUCCESS != ret) { - opal_output(orte_rml_base_framework.framework_output, - "Error returned by OFI Recv handler when handling the received message on the prov: %zd", - ret); - } - } else if ( wc.flags & FI_RECV ) { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Received message on OFI provider id %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id); - // call the receive message handler that will call the rml_base - ret = orte_rml_ofi_recv_handler(&wc, prov->ofi_prov_id); - if (ORTE_SUCCESS != ret) { - opal_output(orte_rml_base_framework.framework_output, - "Error returned by OFI Recv handler when handling the received message on the OFI prov: %zd", - ret); - } - } else if ( wc.flags & FI_MULTI_RECV ) { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Received buffer overrun message on OFI provider id %d - need to repost", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id); - // reposting buffer - ret = fi_recv(orte_rml_ofi.ofi_prov[prov->ofi_prov_id].ep, - orte_rml_ofi.ofi_prov[prov->ofi_prov_id].rxbuf, - orte_rml_ofi.ofi_prov[prov->ofi_prov_id].rxbuf_size, - fi_mr_desc(orte_rml_ofi.ofi_prov[prov->ofi_prov_id].mr_multi_recv), - 0,&(prov->rx_ctx1)); - if (ORTE_SUCCESS != ret) { - opal_output(orte_rml_base_framework.framework_output, - "Error returned by OFI when reposting buffer on the OFI prov: %zd", - ret); - } - }else { - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "CQ has unhandled completion event with FLAG wc.flags = 0x%llx", - (long long unsigned int)wc.flags); - } - } else if (ret == -FI_EAVAIL) { - /** - * An error occured and is being reported via the CQ. - * Read the error and forward it to the upper layer. 
- */ - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s cq_read for OFI provider id %d returned error 0x%zx <%s>", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id, ret, - fi_strerror((int) -ret) ); - ret = fi_cq_readerr(prov->cq,&error,0); - if (0 > ret) { - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "Error returned from fi_cq_readerr: %zd", ret); - } - assert(error.op_context); - /* get the context from wc and call the error handler */ - ofi_req = TO_OFI_REQ(error.op_context); - assert(ofi_req); - ret = orte_rml_ofi_error_callback(&error, ofi_req); - if (ORTE_SUCCESS != ret) { - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "Error returned by request error callback: %zd", - ret); - } - break; - } else if (ret == -FI_EAGAIN){ - /** - * The CQ is empty. Return. - */ - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s Empty cq for OFI provider id %d,exiting from ofi_progress()", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id ); - break; - } else { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s cq_read for OFI provider id %d returned error 0x%zx <%s>", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id, ret, - fi_strerror((int) -ret) ); - break; - } - } - return count; -} - - -/* - * call the ofi_progress() fn to read the cq - * - */ -int cq_progress_handler(int sd, short flags, void *cbdata) -{ - ofi_transport_ofi_prov_t* prov = (ofi_transport_ofi_prov_t*)cbdata; - int count; - - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s cq_progress_handler called for OFI Provider id %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - prov->ofi_prov_id); - - /* call the progress fn to read the cq and process the message - * for the ofi provider */ - count = orte_rml_ofi_progress(prov); - return count; -} - - -/* - * Returns the number of ofi-providers available - */ -static int rml_ofi_component_init(void) -{ - int ret, 
fi_version; - struct fi_info *hints, *fabric_info; - struct fi_cq_attr cq_attr = {0}; - struct fi_av_attr av_attr = {0}; - char *pmix_key; - uint8_t cur_ofi_prov; - - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s - Entering rml_ofi_component_init()",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - - if (init_done) { - return orte_rml_ofi.ofi_prov_open_num; - } - - - /** - * Hints to filter providers - * See man fi_getinfo for a list of all filters - * mode: Select capabilities MTL is prepared to support. - * In this case, MTL will pass in context into communication calls - * ep_type: reliable datagram operation - * caps: Capabilities required from the provider. - * Tag matching is specified to implement MPI semantics. - * msg_order: Guarantee that messages with same tag are ordered. - */ - - hints = fi_allocinfo(); - if (!hints) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: Could not allocate fi_info\n", - __FILE__, __LINE__); - return orte_rml_ofi.ofi_prov_open_num; - } - - /** - * Refine filter for additional capabilities - * endpoint type : Reliable datagram - * threading: Disable locking - * control_progress: enable async progress - */ - hints->mode = FI_CONTEXT; - hints->ep_attr->type = FI_EP_RDM; /* Reliable datagram */ - - hints->domain_attr->threading = FI_THREAD_UNSPEC; - hints->domain_attr->control_progress = FI_PROGRESS_AUTO; - hints->domain_attr->data_progress = FI_PROGRESS_AUTO; - hints->domain_attr->av_type = FI_AV_MAP; - - /** - * FI_VERSION provides binary backward and forward compatibility support - * Specify the version of OFI is coded to, the provider will select struct - * layouts that are compatible with this version. - */ - fi_version = FI_VERSION(1, 3); - - /** - * fi_getinfo: returns information about fabric services for reaching a - * remote node or service. this does not necessarily allocate resources. - * Pass NULL for name/service because we want a list of providers supported. 
- */ - ret = fi_getinfo(fi_version, /* OFI version requested */ - NULL, /* Optional name or fabric to resolve */ - NULL, /* Optional service name or port to request */ - 0ULL, /* Optional flag */ - hints, /* In: Hints to filter providers */ - &orte_rml_ofi.fi_info_list); /* Out: List of matching providers */ - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_getinfo failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - } else { - - /* added for debug purpose - Print the provider info - print_transports_query(); - print_provider_list_info(orte_rml_ofi.fi_info_list); - */ - - /** create the OFI objects for each transport in the system - * (fi_info_list) and store it in the ofi_prov array **/ - orte_rml_ofi.ofi_prov_open_num = 0; // start the ofi_prov_id from 0 - for( fabric_info = orte_rml_ofi.fi_info_list ; - NULL != fabric_info && orte_rml_ofi.ofi_prov_open_num < MAX_OFI_PROVIDERS ; fabric_info = fabric_info->next) - { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s:%d beginning to add endpoint for OFI_provider_id=%d ",__FILE__,__LINE__,orte_rml_ofi.ofi_prov_open_num); - print_provider_info(fabric_info); - cur_ofi_prov = orte_rml_ofi.ofi_prov_open_num; - orte_rml_ofi.ofi_prov[cur_ofi_prov].ofi_prov_id = orte_rml_ofi.ofi_prov_open_num ; - orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric_info = fabric_info; - - // set FI_MULTI_RECV flag for all recv operations - fabric_info->rx_attr->op_flags = FI_MULTI_RECV; - /** - * Open fabric - * The getinfo struct returns a fabric attribute struct that can be used to - * instantiate the virtual or physical network. This opens a "fabric - * provider". See man fi_fabric for details. 
- */ - - ret = fi_fabric(fabric_info->fabric_attr, /* In: Fabric attributes */ - &orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric, /* Out: Fabric handle */ - NULL); /* Optional context for fabric events */ - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_fabric failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric = NULL; - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - - /** - * Create the access domain, which is the physical or virtual network or - * hardware port/collection of ports. Returns a domain object that can be - * used to create endpoints. See man fi_domain for details. - */ - ret = fi_domain(orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric, /* In: Fabric object */ - fabric_info, /* In: Provider */ - &orte_rml_ofi.ofi_prov[cur_ofi_prov].domain, /* Out: Domain oject */ - NULL); /* Optional context for domain events */ - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_domain failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - orte_rml_ofi.ofi_prov[cur_ofi_prov].domain = NULL; - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /** - * Create a transport level communication endpoint. To use the endpoint, - * it must be bound to completion counters or event queues and enabled, - * and the resources consumed by it, such as address vectors, counters, - * completion queues, etc. - * see man fi_endpoint for more details. 
- */ - ret = fi_endpoint(orte_rml_ofi.ofi_prov[cur_ofi_prov].domain, /* In: Domain object */ - fabric_info, /* In: Provider */ - &orte_rml_ofi.ofi_prov[cur_ofi_prov].ep, /* Out: Endpoint object */ - NULL); /* Optional context */ - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_endpoint failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /** - * Save the maximum inject size. - */ - //orte_rml_ofi.max_inject_size = prov->tx_attr->inject_size; - - /** - * Create the objects that will be bound to the endpoint. - * The objects include: - * - completion queue for events - * - address vector of other endpoint addresses - * - dynamic memory-spanning memory region - */ - cq_attr.format = FI_CQ_FORMAT_DATA; - cq_attr.wait_obj = FI_WAIT_FD; - cq_attr.wait_cond = FI_CQ_COND_NONE; - ret = fi_cq_open(orte_rml_ofi.ofi_prov[cur_ofi_prov].domain, - &cq_attr, &orte_rml_ofi.ofi_prov[cur_ofi_prov].cq, NULL); - if (ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_cq_open failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /** - * The remote fi_addr will be stored in the ofi_endpoint struct. - * So, we use the AV in "map" mode. 
- */ - av_attr.type = FI_AV_MAP; - ret = fi_av_open(orte_rml_ofi.ofi_prov[cur_ofi_prov].domain, - &av_attr, &orte_rml_ofi.ofi_prov[cur_ofi_prov].av, NULL); - if (ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_av_open failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /** - * Bind the CQ and AV to the endpoint object. - */ - ret = fi_ep_bind(orte_rml_ofi.ofi_prov[cur_ofi_prov].ep, - (fid_t)orte_rml_ofi.ofi_prov[cur_ofi_prov].cq, - FI_SEND | FI_RECV); - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_bind CQ-EP failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - ret = fi_ep_bind(orte_rml_ofi.ofi_prov[cur_ofi_prov].ep, - (fid_t)orte_rml_ofi.ofi_prov[cur_ofi_prov].av, - 0); - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_bind AV-EP failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /** - * Enable the endpoint for communication - * This commits the bind operations. 
- */ - ret = fi_enable(orte_rml_ofi.ofi_prov[cur_ofi_prov].ep); - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_enable failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s:%d ep enabled for ofi_prov_id - %d ",__FILE__,__LINE__,orte_rml_ofi.ofi_prov[cur_ofi_prov].ofi_prov_id); - - - /** - * Get our address and publish it with modex. - **/ - orte_rml_ofi.ofi_prov[cur_ofi_prov].epnamelen = sizeof (orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name); - ret = fi_getname((fid_t)orte_rml_ofi.ofi_prov[cur_ofi_prov].ep, - &orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name[0], - &orte_rml_ofi.ofi_prov[cur_ofi_prov].epnamelen); - if (ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_getname failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /* Register the ofi address of this peer with PMIX server only if it is a user process / - * for daemons the set/get_contact_info is used to exchange this information */ - if (ORTE_PROC_IS_APP) { - asprintf(&pmix_key,"%s%d",orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric_info->fabric_attr->prov_name,cur_ofi_prov); - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s calling OPAL_MODEX_SEND_STRING for key - %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), pmix_key ); - OPAL_MODEX_SEND_STRING( ret, OPAL_PMIX_GLOBAL, - pmix_key, - orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name, - orte_rml_ofi.ofi_prov[cur_ofi_prov].epnamelen); - /*print debug information on opal_modex_string */ - switch ( orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric_info->addr_format) - { - case FI_SOCKADDR_IN : - 
opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s:%d In FI_SOCKADDR_IN. ",__FILE__,__LINE__); - /* Address is of type sockaddr_in (IPv4) */ - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s sending Opal modex string for ofi prov_id %d, epnamelen = %lu ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),cur_ofi_prov,orte_rml_ofi.ofi_prov[cur_ofi_prov].epnamelen); - /*[debug] - print the sockaddr - port and s_addr */ - struct sockaddr_in* ep_sockaddr = (struct sockaddr_in*)orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name; - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s port = 0x%x, InternetAddr = 0x%s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ntohs(ep_sockaddr->sin_port),inet_ntoa(ep_sockaddr->sin_addr)); - break; - } - /* end of printing opal_modex_string and port, IP */ - free(pmix_key); - if (ORTE_SUCCESS != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: OPAL_MODEX_SEND failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /*abort this current transport, but check if next transport can be opened*/ - continue; - } - } - - /** - * Set the ANY_SRC address. 
- */ - orte_rml_ofi.any_addr = FI_ADDR_UNSPEC; - - /** - * Allocate tx,rx buffers and Post a multi-RECV buffer for each endpoint - **/ - //[TODO later] For now not considering ep_attr prefix_size (add this later) - orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf_size = MIN_MULTI_BUF_SIZE * MULTI_BUF_SIZE_FACTOR; - orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf = malloc(orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf_size); - - ret = fi_mr_reg(orte_rml_ofi.ofi_prov[cur_ofi_prov].domain, - orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf, - orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf_size, - FI_RECV, 0, 0, 0, &orte_rml_ofi.ofi_prov[cur_ofi_prov].mr_multi_recv, - &orte_rml_ofi.ofi_prov[cur_ofi_prov].rx_ctx1); - if (ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_mr_reg failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - ret = fi_setopt(&orte_rml_ofi.ofi_prov[cur_ofi_prov].ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, - &orte_rml_ofi.min_ofi_recv_buf_sz, sizeof(orte_rml_ofi.min_ofi_recv_buf_sz) ); - if (ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_setopt failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - ret = fi_recv(orte_rml_ofi.ofi_prov[cur_ofi_prov].ep, - orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf, - orte_rml_ofi.ofi_prov[cur_ofi_prov].rxbuf_size, - fi_mr_desc(orte_rml_ofi.ofi_prov[cur_ofi_prov].mr_multi_recv), - 0,&orte_rml_ofi.ofi_prov[cur_ofi_prov].rx_ctx1); - if (ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_recv failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be 
opened */ - continue; - } - /** - * get the fd and register the progress fn - **/ - ret = fi_control(&orte_rml_ofi.ofi_prov[cur_ofi_prov].cq->fid, FI_GETWAIT, - (void *) &orte_rml_ofi.ofi_prov[cur_ofi_prov].fd); - if (0 != ret) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s:%d: fi_control failed to get fd: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); - free_ofi_prov_resources(cur_ofi_prov); - /* abort this current transport, but check if next transport can be opened */ - continue; - } - - /* - create the event that will wait on the fd*/ - /* use the opal_event_set to do a libevent set on the fd - * so when something is available to read, the cq_porgress_handler - * will be called */ - opal_event_set(orte_event_base, - &orte_rml_ofi.ofi_prov[cur_ofi_prov].progress_event, - orte_rml_ofi.ofi_prov[cur_ofi_prov].fd, - OPAL_EV_READ|OPAL_EV_PERSIST, - cq_progress_handler, - &orte_rml_ofi.ofi_prov[cur_ofi_prov]); - opal_event_add(&orte_rml_ofi.ofi_prov[cur_ofi_prov].progress_event, 0); - orte_rml_ofi.ofi_prov[cur_ofi_prov].progress_ev_active = true; - - /** update the number of ofi_provs in the ofi_prov[] array **/ - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s:%d ofi_prov id - %d created ",__FILE__,__LINE__,orte_rml_ofi.ofi_prov_open_num); - orte_rml_ofi.ofi_prov_open_num++; - } - if (fabric_info != NULL && orte_rml_ofi.ofi_prov_open_num >= MAX_OFI_PROVIDERS ) { - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s:%d fi_getinfo list not fully parsed as MAX_OFI_PROVIDERS - %d reached ",__FILE__,__LINE__,orte_rml_ofi.ofi_prov_open_num); - } - - - } - /** - * Free providers info since it's not needed anymore. 
- */ - fi_freeinfo(hints); - hints = NULL; - /* check if atleast one ofi_prov was successfully opened */ - if (0 < orte_rml_ofi.ofi_prov_open_num ) { - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s:%d ofi providers openened=%d returning orte_rml_ofi.api", - __FILE__,__LINE__,orte_rml_ofi.ofi_prov_open_num); - - OBJ_CONSTRUCT(&orte_rml_ofi.recv_msg_queue_list,opal_list_t); - } else { - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s:%d Failed to open any OFI Providers",__FILE__,__LINE__); - } - - return orte_rml_ofi.ofi_prov_open_num; -} - -/* return : the ofi_prov_id that corresponds to the transport requested by the attributes - if transport is not found RML_OFI_PROV_ID_INVALID is returned. - @[in]attributes : the attributes passed in to open_conduit reg the transport requested -*/ -int get_ofi_prov_id( opal_list_t *attributes) -{ - - int ofi_prov_id = RML_OFI_PROV_ID_INVALID, prov_num=0; - char *provider = NULL, *transport = NULL; - char *ethernet="sockets", *fabric="psm2"; - struct fi_info *cur_fi; - - /* check the list of attributes to see if we should respond - * Attribute should have ORTE_RML_TRANSPORT_ATTRIB key - * with values "ethernet" or "fabric" - * (or) ORTE_RML_OFI_PROV_NAME key with values "socket" or "OPA" - * if both above attributes are missing return failure - */ - if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_ATTRIB, (void**)&transport, OPAL_STRING) ) { - if( 0 == strcmp( transport, "ethernet") ) { - provider = ethernet; - } else if ( 0 == strcmp( transport, "fabric") ) { - provider = fabric; - } - } - /* if from the transport we don't know which provider we want, then check for the ORTE_RML_OFI_PROV_NAME_ATTRIB */ - if ( NULL == provider) { - orte_get_attribute(attributes, ORTE_RML_PROVIDER_ATTRIB, (void**)&provider, OPAL_STRING); - } - if (NULL != provider) - { - // loop the orte_rml_ofi.ofi_provs[] and find the provider name that matches - for ( prov_num = 0; prov_num < 
orte_rml_ofi.ofi_prov_open_num && ofi_prov_id == RML_OFI_PROV_ID_INVALID ; prov_num++ ) { - cur_fi = orte_rml_ofi.ofi_prov[prov_num].fabric_info; - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - get_ofi_prov_id() -> comparing %s = %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),provider,cur_fi->fabric_attr->prov_name); - if ( strcmp(provider,cur_fi->fabric_attr->prov_name) == 0) { - ofi_prov_id = prov_num; - } - } - - } - - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - get_ofi_prov_id(), returning ofi_prov_id=%d ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ofi_prov_id); - return ofi_prov_id; -} - -/* - * Allocate a new module and initialise ofi_prov information - * for the requested provider and return the module * - */ -static orte_rml_base_module_t* make_module( int ofi_prov_id) -{ - orte_rml_ofi_module_t *mod = NULL; - - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - rml_ofi make_module() begin ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - if ( RML_OFI_PROV_ID_INVALID == ofi_prov_id) { - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - open_conduit did not select any ofi provider, returning NULL ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return NULL; - } - - - /* create a new module */ - mod = (orte_rml_ofi_module_t*)calloc(1,sizeof(orte_rml_ofi_module_t)); - if (NULL == mod) { - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - Module allocation failed, returning NULL ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return NULL; - } - /* copy the APIs over to it and the OFI provider information */ - memcpy(mod, &orte_rml_ofi, sizeof(orte_rml_ofi_module_t)); - /* setup the remaining data locations in mod, associate conduit with ofi provider selected*/ - mod->cur_transport_id = ofi_prov_id; - - return (orte_rml_base_module_t*)mod; -} - - -/* Order of attributes honoring * -* ORTE_RML_INCLUDE_COMP_ATTRIB * -* ORTE_RML_EXCLUDE_COMP_ATTRIB * -* 
ORTE_RML_TRANSPORT_ATTRIB * -* ORTE_RML_PROVIDER_ATTRIB */ -static orte_rml_base_module_t* open_conduit(opal_list_t *attributes) -{ - char *comp_attrib = NULL; - char **comps; - int i; - orte_attribute_t *attr; - opal_list_t provider; - - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - Entering rml_ofi_open_conduit()", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - - /* Open all ofi endpoints */ - if (!init_done) { - rml_ofi_component_init(); - init_done = true; - } - - /* check if atleast 1 ofi provider is initialised */ - if ( 0 >= orte_rml_ofi.ofi_prov_open_num) { - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - Init did not open any Ofi endpoints, returning NULL", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return NULL; - } - - - /* someone may require this specific component, so look for "ofi" */ - if (orte_get_attribute(attributes, ORTE_RML_INCLUDE_COMP_ATTRIB, (void**)&comp_attrib, OPAL_STRING) && - NULL != comp_attrib) { - /* they specified specific components - could be multiple */ - comps = opal_argv_split(comp_attrib, ','); - for (i=0; NULL != comps[i]; i++) { - if (0 == strcmp(comps[i], "ofi")) { - /* we are a candidate, */ - opal_argv_free(comps); - return make_module(get_ofi_prov_id(attributes)); - } - } - /* we are not a candidate */ - opal_argv_free(comps); - return NULL; - } else if (orte_get_attribute(attributes, ORTE_RML_EXCLUDE_COMP_ATTRIB, (void**)&comp_attrib, OPAL_STRING) && - NULL != comp_attrib) { - /* see if we are on the list */ - comps = opal_argv_split(comp_attrib, ','); - for (i=0; NULL != comps[i]; i++) { - if (0 == strcmp(comps[i], "ofi")) { - /* we cannot be a candidate */ - opal_argv_free(comps); - return NULL; - } - } - } - - /*[Debug] to check for daemon commn over ofi-ethernet, enable the default conduit ORTE_MGMT_CONDUIT over ofi */ - if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_TYPE, (void**)&comp_attrib, OPAL_STRING) && - NULL != comp_attrib) { - 
opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - Forcibly returning ofi socket provider for ethernet transport request", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - comps = opal_argv_split(comp_attrib, ','); - for (i=0; NULL != comps[i]; i++) { - if (0 == strcmp(comps[i], "ethernet")) { - /* we are a candidate, */ - opal_argv_free(comps); - OBJ_CONSTRUCT(&provider, opal_list_t); - orte_set_attribute(&provider, ORTE_RML_PROVIDER_ATTRIB, - ORTE_ATTR_LOCAL, "sockets", OPAL_STRING); - return make_module(get_ofi_prov_id(&provider)); - } - } - opal_argv_free(comps); - } - /*[Debug] */ - - /* Alternatively, check the attributes to see if we qualify - we only handle - * "pt2pt" */ - OPAL_LIST_FOREACH(attr, attributes, orte_attribute_t) { - /* [TODO] add any additional attributes check here */ - - } - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - ofi is not a candidate as per attributes, returning NULL", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* if we get here, we cannot handle it */ - return NULL; -} - -static void pr_cons(orte_rml_ofi_peer_t *ptr) -{ - ptr->ofi_ep = NULL; - ptr->ofi_ep_len = 0; -} - -static void pr_des(orte_rml_ofi_peer_t *ptr) -{ - if ( 0 < ptr->ofi_ep_len) - free( ptr->ofi_ep); -} - -OBJ_CLASS_INSTANCE(orte_rml_ofi_peer_t, - opal_object_t, - pr_cons, pr_des); - - -/* The returned string will be of format - */ -/* ";ofi-socket:;ofi-:" */ -/* caller will take care of string length check to not exceed limit */ -static char* ofi_get_contact_info(void) -{ - char *turi, *final=NULL, *tmp, *addrtype; - int rc=ORTE_SUCCESS, cur_ofi_prov=0; - struct sockaddr_in* ep_sockaddr; - - /* start with our process name */ - if (ORTE_SUCCESS != (rc = orte_util_convert_process_name_to_string(&final, ORTE_PROC_MY_NAME))) { - /* [TODO] ORTE_ERROR_LOG(rc); */ - return final; - } - - /* The returned string will be of format - ";ofi-addr:;" */ - /* we are sending only the ethernet address */ - for( cur_ofi_prov=0; cur_ofi_prov < 
orte_rml_ofi.ofi_prov_open_num ; cur_ofi_prov++ ) { - if ( FI_SOCKADDR_IN == orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric_info->addr_format) { - ep_sockaddr = (struct sockaddr_in*)orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name; - asprintf(&addrtype, OFIADDR); - asprintf(&turi,"%d,%s,%d",ep_sockaddr->sin_family,inet_ntoa(ep_sockaddr->sin_addr),ntohs(ep_sockaddr->sin_port)); - opal_output_verbose(20,orte_rml_base_framework.framework_output, - "%s - cur_ofi_prov = %d, addrtype = %s ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),cur_ofi_prov,addrtype); - /* Add to the final string - the ofi addrtype and the epname */ - asprintf(&tmp, "%s;%s:%s", final,addrtype, turi); - - free(addrtype); - free(turi); - free(final); - final = tmp; - } - } - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "[%s] get_contact_info returns string - %s ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),final); - return final; -} - - -static void ofi_set_contact_info (const char *uri) -{ - char *uris; - - opal_output_verbose(5, orte_rml_base_framework.framework_output, - "%s: OFI set_contact_info to uri %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == uri) ? 
"NULL" : uri); - - /* if the request doesn't contain a URI, then we - * have an error - */ - if (NULL == uri) { - opal_output(0, "%s: NULL URI", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* [TODO] ORTE_FORCED_TERMINATE(1);*/ - return; - } - - uris = strdup(uri); - process_uri(uris); - free(uris); - return; -} - -static void process_uri( char *uri) -{ - orte_process_name_t peer; - char *cptr, *ofiuri; - char **uris=NULL; - int rc, i=0, tot_reqd = 1, tot_found = 0; - uint64_t ui64; - orte_rml_ofi_peer_t *pr; - struct sockaddr_in* ep_sockaddr; - - /* find the first semi-colon in the string */ - cptr = strchr(uri, ';'); - if (NULL == cptr) { - /* got a problem - there must be at least two fields, - * the first containing the process name of our peer - * and all others containing the OOB contact info - */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return; - } - *cptr = '\0'; - cptr++; - - /* the first field is the process name, so convert it */ - orte_util_convert_string_to_process_name(&peer, uri); - - /* if the peer is us, no need to go further as we already - * know our own contact info - */ - if (peer.jobid == ORTE_PROC_MY_NAME->jobid && - peer.vpid == ORTE_PROC_MY_NAME->vpid) { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s:OFI set_contact_info peer %s is me", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer)); - //skip adding to hashtable for HNP - if (!ORTE_PROC_IS_HNP) { - return; - } else { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s:OFI set_contact_info - HNP process so proceeding to add to hashtable", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - } - } - - /* split the rest of the uri into component parts */ - uris = opal_argv_split(cptr, ';'); - - /* get the peer object for this process */ - memcpy(&ui64, (char*)&peer, sizeof(uint64_t)); - if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_rml_ofi.peers, - ui64, (void**)&pr) || - NULL == pr) { - pr = OBJ_NEW(orte_rml_ofi_peer_t); - /* 
populate the peer object with the ofi addresses */ - for(i=0; NULL != uris[i] && tot_found < tot_reqd; i++) { - ofiuri = strdup(uris[i]); - if (NULL == ofiuri) { - opal_output_verbose(2, orte_rml_base_framework.framework_output, - "%s rml:ofi: out of memory", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - continue; - } - /* Handle the OFI address types in the uri - OFIADDR(ofiaddr) */ - if (0 == strncmp(ofiuri, OFIADDR, strlen(OFIADDR)) ) { - /* allocate and initialise the peer object to be inserted in hashtable */ - pr->ofi_ep_len = sizeof(struct sockaddr_in); - ep_sockaddr = malloc( sizeof ( struct sockaddr_in) ); - /* ofiuri for socket provider is of format - ofi-socket: */ - convert_to_sockaddr(ofiuri, ep_sockaddr); - pr->ofi_ep = (void *)ep_sockaddr; - tot_found++; - } - free( ofiuri); - } - /* if atleast one OFI address is known for peer insert it */ - if( 1 <= tot_found ) { - if (OPAL_SUCCESS != - (rc = opal_hash_table_set_value_uint64(&orte_rml_ofi.peers, ui64, (void*)pr))) { - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s: ofi peer address insertion failed for peer %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer)); - ORTE_ERROR_LOG(rc); - } - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s: ofi peer address inserted for peer %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&peer)); - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s: ofi sock address length = %zd ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - pr->ofi_ep_len); - struct sockaddr_in* ep_sockaddr = (struct sockaddr_in*)pr->ofi_ep; - opal_output_verbose(15,orte_rml_base_framework.framework_output, - "%s OFI set_name() port = 0x%x, InternetAddr = %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ntohs(ep_sockaddr->sin_port),inet_ntoa(ep_sockaddr->sin_addr)); - } - } - opal_output_verbose(10,orte_rml_base_framework.framework_output, - "%s OFI end of set_contact_info()", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - 
opal_argv_free(uris); - return; -} - - -/* converts the socket uri returned by get_contact_info into sockaddr_in */ -void convert_to_sockaddr( char *ofiuri, struct sockaddr_in* ep_sockaddr) -{ - char *tmp, *sin_fly, *sin_port, *sin_addr; - short port; - - tmp = strchr(ofiuri,':'); - sin_fly = tmp+1; - tmp = strchr(sin_fly,','); - sin_addr = tmp+1; - *tmp = '\0'; - tmp = strchr(sin_addr,','); - sin_port = tmp + 1; - *tmp = '\0'; - - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s OFI convert_to_sockaddr uri strings got -> family = %s, InternetAddr = %s, port = %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),sin_fly,sin_addr, sin_port); - ep_sockaddr->sin_family = atoi( sin_fly ); - port = atoi( sin_port); - ep_sockaddr->sin_port = htons(port); - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s OFI convert_to_sockaddr() port = 0x%x decimal-%d, InternetAddr = %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ntohs(ep_sockaddr->sin_port),ntohs(ep_sockaddr->sin_port), - inet_ntoa(ep_sockaddr->sin_addr)); -} diff --git a/orte/mca/rml/ofi/rml_ofi_request.h b/orte/mca/rml/ofi/rml_ofi_request.h deleted file mode 100644 index 54b8203ae84..00000000000 --- a/orte/mca/rml/ofi/rml_ofi_request.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2015 Intel, Inc. All rights reserved - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef ORTE_RML_OFI_REQUEST_H -#define ORTE_RML_OFI_REQUEST_H - - -#define TO_OFI_REQ(_ptr_ctx) \ - container_of((_ptr_ctx), orte_rml_ofi_request_t, ctx) - -typedef enum { - ORTE_RML_OFI_SEND, - ORTE_RML_OFI_RECV, - ORTE_RML_OFI_ACK, - ORTE_RML_OFI_PROBE -} orte_rml_ofi_request_type_t; -/* orte_rml_ofi_msg_header_t contains the header information for the message being sent. -The header and data is passed on to the destination. 
The destination will re-construct the -orte_rml_sent_t struct once it receives this header and data.This header has the required information -to construct the orte_rml_sent_t struct and also if the message is split into packets, -then the packet information - total number of packets and the current packet number. -*/ -struct orte_rml_ofi_msg_header_t{ - opal_process_name_t origin; // originator process id from the send message - opal_process_name_t dst; // Destination process id from the send message - uint32_t seq_num; // seq_num from the send message - orte_rml_tag_t tag; // tag from the send message - uint32_t msgid; // unique msgid added by ofi plugin to keep track of fragmented msgs - uint32_t tot_pkts; // total packets this msg will be fragmented into by ofi plugin - uint32_t cur_pkt_num; // current packet number - }; -typedef struct orte_rml_ofi_msg_header_t orte_rml_ofi_msg_header_t; - -/* -orte_rml_ofi_pkts_t defines the packets in the message. Each packet contains header information -and the data. Create a list of packets to hold the entire message. -*/ -typedef struct { - //list_item_t - opal_list_item_t super; - /* header + data size */ - size_t pkt_size; - //header + data - void *data; -}orte_rml_ofi_send_pkt_t; -OBJ_CLASS_DECLARATION(orte_rml_ofi_send_pkt_t); - -/* -orte_rml_ofi_recv_pkt_t defines the packets in the receiving end of message. -Each packet contains the packet number and the data. -Create a list of packets to hold the entire message. 
-*/ -typedef struct { - //list_item_t - opal_list_item_t super; - /* current packet number */ - uint32_t cur_pkt_num; - /*data size */ - size_t pkt_size; - //data - void *data; -}orte_rml_ofi_recv_pkt_t; -OBJ_CLASS_DECLARATION(orte_rml_ofi_recv_pkt_t); - -/* -orte_rml_ofi_request_t holds the send request (orte_rml_send_t) -*/ -typedef struct { - opal_object_t super; - - /** OFI context */ - struct fi_context ctx; - - orte_rml_send_t *send; - - /** OFI provider_id the request will use - this is - * the reference to element into the orte_rml_ofi.ofi_prov[] **/ - uint8_t ofi_prov_id; - - /** OFI Request type */ - orte_rml_ofi_request_type_t type; - - /** Completion count used by blocking and/or synchronous operations */ - volatile int completion_count; - - /** Reference to the RML used to lookup */ - /* source of an ANY_SOURCE Recv */ - struct orte_rml_base_module_t* rml; - - /** header being sent **/ - orte_rml_ofi_msg_header_t hdr; - - /** Pack buffer */ - void *data_blob; - - /** Pack buffer size */ - size_t length; - - /** Header and data in a list of Packets orte_rml_ofi_send_pkt_t */ - opal_list_t pkt_list; - -} orte_rml_ofi_request_t; -OBJ_CLASS_DECLARATION(orte_rml_ofi_request_t); - - -/* This will hold all the pckts received at the destination. -Each entry will be indexed by [sender,msgid] and will have -all the packets for that msgid and sender. 
-*/ -typedef struct { - - opal_list_item_t super; //list_item_t - uint32_t msgid; // unique msgid added by ofi plugin to keep track of fragmented msgs - opal_process_name_t sender; // originator process id from the send message - uint32_t tot_pkts; // total packets this msg will be fragmented into by ofi plugin - uint32_t pkt_recd; // current packet number - opal_list_t pkt_list; // list holding Packets in this msg of type orte_rml_ofi_recv_pkt_t -} ofi_recv_msg_queue_t; -OBJ_CLASS_DECLARATION( ofi_recv_msg_queue_t); - -/* define an object for transferring send requests to the event lib */ -typedef struct { - opal_object_t super; - opal_event_t ev; - orte_rml_send_t send; - /* ofi provider id */ - int ofi_prov_id; -} ofi_send_request_t; -OBJ_CLASS_DECLARATION(ofi_send_request_t); - -#endif diff --git a/orte/mca/rml/ofi/rml_ofi_send.c b/orte/mca/rml/ofi/rml_ofi_send.c deleted file mode 100644 index 718c13a017e..00000000000 --- a/orte/mca/rml/ofi/rml_ofi_send.c +++ /dev/null @@ -1,805 +0,0 @@ -/* - * Copyright (c) 2015-2016 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" - -#include "opal/dss/dss_types.h" -#include "opal/util/output.h" -#include "opal/mca/event/event.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rml/base/base.h" -#include "orte/mca/rml/rml_types.h" - -#include -#include -#include -#include -#include -#include - -#include "rml_ofi.h" - - -static void ofi_req_cons(orte_rml_ofi_request_t *ptr) -{ - OBJ_CONSTRUCT(&ptr->pkt_list, opal_list_t); -} -static void ofi_req_des(orte_rml_ofi_request_t *ptr) -{ - OPAL_LIST_DESTRUCT(&ptr->pkt_list); -} -OBJ_CLASS_INSTANCE(orte_rml_ofi_request_t, - opal_object_t, - ofi_req_cons, ofi_req_des); - - -static void ofi_send_req_cons(ofi_send_request_t *ptr) -{ - OBJ_CONSTRUCT(&ptr->send, orte_rml_send_t); -} -OBJ_CLASS_INSTANCE(ofi_send_request_t, - opal_object_t, - ofi_send_req_cons, NULL); - -OBJ_CLASS_INSTANCE(orte_rml_ofi_send_pkt_t, - opal_list_item_t, - NULL, NULL); - -OBJ_CLASS_INSTANCE(orte_rml_ofi_recv_pkt_t, - opal_list_item_t, - NULL, NULL); - - -static void ofi_recv_msg_queue_cons(ofi_recv_msg_queue_t *ptr) -{ - ptr->msgid = 0; - ptr->tot_pkts = 1; - ptr->pkt_recd = 0; - OBJ_CONSTRUCT(&ptr->pkt_list, opal_list_t); -} -static void ofi_recv_msg_queue_des(ofi_recv_msg_queue_t *ptr) -{ - OPAL_LIST_DESTRUCT(&ptr->pkt_list); -} -OBJ_CLASS_INSTANCE(ofi_recv_msg_queue_t, - opal_list_item_t, - ofi_recv_msg_queue_cons, ofi_recv_msg_queue_des); - - -static void send_self_exe(int fd, short args, void* data) -{ - orte_self_send_xfer_t *xfer = (orte_self_send_xfer_t*)data; - - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml_send_to_self ofi callback executing for tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), xfer->tag); - - /* execute the send callback function - note that - * send-to-self always returns a SUCCESS status - */ - if (NULL != xfer->iov) { - if (NULL != xfer->cbfunc.iov) { - /* non-blocking iovec send */ - 
xfer->cbfunc.iov(ORTE_SUCCESS, ORTE_PROC_MY_NAME, xfer->iov, xfer->count, - xfer->tag, xfer->cbdata); - } - } else if (NULL != xfer->buffer) { - if (NULL != xfer->cbfunc.buffer) { - /* non-blocking buffer send */ - xfer->cbfunc.buffer(ORTE_SUCCESS, ORTE_PROC_MY_NAME, xfer->buffer, - xfer->tag, xfer->cbdata); - } - } else { - /* should never happen */ - abort(); - } - - /* cleanup the memory */ - OBJ_RELEASE(xfer); -} - -/** Send callback */ -/* [Desc] This is called from the progress fn when a send completion -** is received in the cq -** wc [in] : the completion queue data entry -** ofi_send_req [in]: ofi send request with the send msg and callback -*/ -int orte_rml_ofi_send_callback(struct fi_cq_data_entry *wc, - orte_rml_ofi_request_t* ofi_req) -{ - orte_rml_ofi_send_pkt_t *ofi_send_pkt, *next; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s orte_rml_ofi_send_callback called, completion count = %d, msgid = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_req->completion_count, ofi_req->hdr.msgid); - assert(ofi_req->completion_count > 0); - ofi_req->completion_count--; - if ( 0 == ofi_req->completion_count ) { - // call the callback fn of the sender - ofi_req->send->status = ORTE_SUCCESS; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s calling ORTE_RML_SEND_COMPLETE macro for msgid = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_req->hdr.msgid); - ORTE_RML_SEND_COMPLETE(ofi_req->send); - OPAL_LIST_FOREACH_SAFE(ofi_send_pkt, next, &ofi_req->pkt_list, orte_rml_ofi_send_pkt_t) { - free( ofi_send_pkt->data); - ofi_send_pkt->pkt_size=0; - opal_list_remove_item(&ofi_req->pkt_list, &ofi_send_pkt->super); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Removed pkt from list ",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - OBJ_RELEASE(ofi_send_pkt); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Released packet ",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - } - 
free(ofi_req->data_blob); - OBJ_RELEASE(ofi_req); - } - - // [TODO] need to check for error before returning success - return ORTE_SUCCESS; -} - -/** Error callback */ -/* [Desc] This is called from the progress fn when a send completion -** is received in the cq -** wc [in] : the completion queue data entry -** ofi_send_req [in]: ofi send request with the send msg and callback -*/ -int orte_rml_ofi_error_callback(struct fi_cq_err_entry *error, - orte_rml_ofi_request_t* ofi_req) -{ - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s orte_rml_ofi_error_callback called ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - switch(error->err) { - default: - /* call the send-callback fn with error and return, also return failure status */ - ofi_req->send->status = ORTE_ERR_CONDUIT_SEND_FAIL; - ORTE_RML_SEND_COMPLETE(ofi_req->send); - } - return ORTE_SUCCESS; -} - -/** Recv handler */ -/* [Desc] This is called from the progress fn when a recv completion -** is received in the cq -** wc [in] : the completion queue data entry */ -int orte_rml_ofi_recv_handler(struct fi_cq_data_entry *wc, uint8_t ofi_prov_id) -{ - orte_rml_ofi_msg_header_t msg_hdr; - uint32_t msglen, datalen = 0; - char *data, *totdata, *nextpkt; - ofi_recv_msg_queue_t *recv_msg_queue, *new_msg; - orte_rml_ofi_recv_pkt_t *ofi_recv_pkt, *new_pkt, *next; - bool msg_in_queue = false; - - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s orte_rml_ofi_recv_handler called ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - /*copy the header and data from buffer and pass it on - ** since this is the ofi_prov recv buffer don't want it to be released as - ** considering re-using it, so for now copying to newly allocated *data - ** the *data will be released by orte_rml_base functions */ - - memcpy(&msg_hdr,wc->buf,sizeof(orte_rml_ofi_msg_header_t)); - msglen = wc->len - sizeof(orte_rml_ofi_msg_header_t); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Received 
packet -> msg id = %d wc->len = %lu, msglen = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid, wc->len, msglen ); - data = (char *)malloc(msglen); - memcpy(data,((char *)wc->buf+sizeof(orte_rml_ofi_msg_header_t)),msglen); - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s header info of received packet -> cur_pkt_num = %d, tot_pkts = %d ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.cur_pkt_num, msg_hdr.tot_pkts ); - /* To accomodate message bigger than recv buffer size, - check if current message is in multiple blocks and append them before sending it to RML */ - if ( msg_hdr.tot_pkts == 1) { - /* Since OFI is point-to-point, no need to check if the intended destination is me - send to RML */ - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Posting Recv for msgid %d, from peer - %s , Tag = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid, ORTE_NAME_PRINT(&msg_hdr.origin),msg_hdr.tag ); - ORTE_RML_POST_MESSAGE(&msg_hdr.origin, msg_hdr.tag, msg_hdr.seq_num,data,msglen); - } else { - msg_in_queue = false; - new_pkt = OBJ_NEW(orte_rml_ofi_recv_pkt_t); - new_pkt->cur_pkt_num = msg_hdr.cur_pkt_num; - new_pkt->pkt_size = msglen; - new_pkt->data = data; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Just beofe checking if this message-pkt is already in queue. 
msgid-%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid ); - /* check if the queue has the [msgid,sender] entry */ - OPAL_LIST_FOREACH(recv_msg_queue, &orte_rml_ofi.recv_msg_queue_list, ofi_recv_msg_queue_t) { - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Checking msgid-%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid ); - if( (recv_msg_queue->msgid == msg_hdr.msgid) && (recv_msg_queue->sender.jobid == msg_hdr.origin.jobid) - && (recv_msg_queue->sender.vpid == msg_hdr.origin.vpid) ) { - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Found Msg entry in queue for msgid %d, sender jobid=%d, sender vpid=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->sender.jobid, recv_msg_queue->sender.vpid); - msg_in_queue = true; - - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s msgid %d, tot_pkts=%d, opal_list_get_size()=%lu,total pkt_recd=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->tot_pkts, - opal_list_get_size(&recv_msg_queue->pkt_list), recv_msg_queue->pkt_recd ); - if( recv_msg_queue->tot_pkts == (recv_msg_queue->pkt_recd +1) ) { - /* all packets received for this message - post message to rml and remove this from queue */ - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s All packets recd for msgid %d, tot_pkts=%d, opal_list_get_size()=%lu,total pkt_recd=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->tot_pkts, - opal_list_get_size(&recv_msg_queue->pkt_list), recv_msg_queue->pkt_recd ); - totdata = NULL; - datalen = 0; - OPAL_LIST_FOREACH(ofi_recv_pkt, &recv_msg_queue->pkt_list, orte_rml_ofi_recv_pkt_t) { - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Adding data for packet %d, pktlength = %lu, cumulative datalen so far = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_recv_pkt->cur_pkt_num, ofi_recv_pkt->pkt_size, datalen 
); - if (0 == datalen) { - totdata = (char *)malloc(ofi_recv_pkt->pkt_size); - if( totdata == NULL) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s Error: malloc failed for msgid %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),recv_msg_queue->msgid ); - return 1; //[TODO: error-handling needs to be implemented - } - memcpy(totdata,ofi_recv_pkt->data,ofi_recv_pkt->pkt_size); - - } else { - totdata = realloc(totdata,datalen+ofi_recv_pkt->pkt_size); - if (NULL != totdata ) { - memcpy((totdata+datalen),ofi_recv_pkt->data,ofi_recv_pkt->pkt_size); - } else { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s Error: realloc failed for msgid %d, from sender jobid=%d, sender vpid=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, recv_msg_queue->sender.jobid, - recv_msg_queue->sender.vpid); - return 1; //[TODO: error-handling needs to be implemented - } - } - datalen += ofi_recv_pkt->pkt_size; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s packet %d done, datalen = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_recv_pkt->cur_pkt_num,datalen); - } - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Adding leftover data recd, datalen = %d, new_pkt->pkt_size = %lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), datalen, new_pkt->pkt_size); - //add the last packet - totdata =realloc(totdata,datalen+new_pkt->pkt_size); - if( NULL != totdata ) { - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Realloc completed for leftover data recd, datalen = %d, new->pkt->pkt_size = %lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), datalen, new_pkt->pkt_size); - nextpkt = totdata+datalen; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s totdata = %p,nextpkt = %p ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), totdata, nextpkt); - memcpy(nextpkt,new_pkt->data,new_pkt->pkt_size); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s memcpy 
completed for leftover data recd, datalen = %d, new->pkt->pkt_size = %lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), datalen, new_pkt->pkt_size); - datalen += new_pkt->pkt_size; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Posting Recv for msgid %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid ); - ORTE_RML_POST_MESSAGE(&msg_hdr.origin, msg_hdr.tag, msg_hdr.seq_num,totdata,datalen);\ - - // free the pkts - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s msgid %d - posting recv completed, freeing packets", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr.msgid ); - OPAL_LIST_FOREACH_SAFE(ofi_recv_pkt, next, &recv_msg_queue->pkt_list, orte_rml_ofi_recv_pkt_t) { - free( ofi_recv_pkt->data); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s freed data for packet %d",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_recv_pkt->cur_pkt_num ); - ofi_recv_pkt->pkt_size=0; - opal_list_remove_item(&recv_msg_queue->pkt_list, &ofi_recv_pkt->super); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Removed pkt from list ",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - OBJ_RELEASE(ofi_recv_pkt); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Released packet ",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - } - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s freeing packets completed",ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - //free the msg from the queue-list - opal_list_remove_item(&orte_rml_ofi.recv_msg_queue_list,&recv_msg_queue->super); - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Successfully removed msg from queue", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - OBJ_RELEASE(recv_msg_queue); - } else { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s Error: realloc failed for msgid %d, from sender jobid=%d, sender vpid=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, 
recv_msg_queue->sender.jobid, - recv_msg_queue->sender.vpid); - return 1; //[TODO: error-handling needs to be implemented - } - } else { - /* add this packet to the msg in the queue ordered by cur_pkt_num */ - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Adding packet to list, msgid %d, pkt - %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg_queue->msgid, msg_hdr.cur_pkt_num ); - - bool pkt_added = false; - OPAL_LIST_FOREACH(ofi_recv_pkt, &recv_msg_queue->pkt_list, orte_rml_ofi_recv_pkt_t) { - if( msg_hdr.cur_pkt_num < ofi_recv_pkt->cur_pkt_num ) { - opal_list_insert_pos(&recv_msg_queue->pkt_list, (opal_list_item_t*)ofi_recv_pkt, &new_pkt->super); - recv_msg_queue->pkt_recd++; - pkt_added = true; - break; - } - } - if (!pkt_added) { - opal_list_append(&recv_msg_queue->pkt_list,&new_pkt->super); - recv_msg_queue->pkt_recd++; - } - } - } - break; //we found the msg or added it so exit out of the msg_queue loop - } - if( !msg_in_queue ) { - /*add to the queue as this is the first packet for [msgid,sender] */ - new_msg = OBJ_NEW(ofi_recv_msg_queue_t); - new_msg->msgid = msg_hdr.msgid; - new_msg->sender = msg_hdr.origin; - new_msg->tot_pkts = msg_hdr.tot_pkts; - new_msg->pkt_recd = 1; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Adding first Msg entry in queue for msgid %d, sender jobid=%d, sender vpid=%d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), new_msg->msgid, new_msg->sender.jobid, new_msg->sender.vpid); - opal_list_append(&new_msg->pkt_list, &new_pkt->super); - opal_list_append(&orte_rml_ofi.recv_msg_queue_list, &new_msg->super); - - } - } - return ORTE_SUCCESS; -} - - -static void send_msg(int fd, short args, void *cbdata) -{ - ofi_send_request_t *req = (ofi_send_request_t*)cbdata; - orte_process_name_t *peer = &(req->send.dst); - orte_rml_tag_t tag = req->send.tag; - char *dest_ep_name, *pmix_key; - size_t dest_ep_namelen = 0; - int ret = OPAL_ERROR; - uint32_t total_packets; - fi_addr_t dest_fi_addr; - 
orte_rml_send_t *snd; - orte_rml_recv_t *rcv; - orte_self_send_xfer_t *xfer; - orte_rml_ofi_request_t* ofi_send_req = OBJ_NEW( orte_rml_ofi_request_t ); - uint8_t ofi_prov_id = req->ofi_prov_id; - orte_rml_ofi_send_pkt_t* ofi_msg_pkt; - size_t datalen_per_pkt, hdrsize, data_in_pkt; // the length of data in per packet excluding the header size - orte_rml_ofi_peer_t* pr; - uint64_t ui64; - struct sockaddr_in* ep_sockaddr; - int i, bytes; - char *ptr; - - snd = OBJ_NEW(orte_rml_send_t); - snd->dst = *peer; - snd->origin = *ORTE_PROC_MY_NAME; - snd->tag = tag; - if (NULL != req->send.iov) { - snd->iov = req->send.iov; - snd->count = req->send.count; - snd->cbfunc.iov = req->send.cbfunc.iov; - } else { - snd->buffer = req->send.buffer; - snd->cbfunc.buffer = req->send.cbfunc.buffer; - } - snd->cbdata = req->send.cbdata; - - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s send_msg_transport to peer %s at tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer), tag); - - - /* get the peer address by doing modex_receive */ - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s calling OPAL_MODEX_RECV_STRING ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - // if dest is same as me then instead of doing lookup just populate the dest_ep_name - /*if (!ORTE_PROC_IS_APP && peer->jobid == ORTE_PROC_MY_NAME->jobid && peer->vpid == ORTE_PROC_MY_NAME->vpid) { - dest_ep_namelen = orte_rml_ofi.ofi_prov[ofi_prov_id].epnamelen; - dest_ep_name = (char *)calloc(dest_ep_namelen,sizeof(char)); - memcpy( dest_ep_name, orte_rml_ofi.ofi_prov[ofi_prov_id].ep_name,dest_ep_namelen); - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi: send and dest are same so proceeding with cur provider ep_name ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - ret = OPAL_SUCCESS; - } else {*/ - if (ORTE_PROC_IS_APP ) { - asprintf(&pmix_key,"%s%d",orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->fabric_attr->prov_name,ofi_prov_id); - 
opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s calling OPAL_MODEX_RECV_STRING for ORTE_PROC_APP peer - %s, key - %s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer),pmix_key ); - OPAL_MODEX_RECV_STRING(ret, pmix_key, peer , (uint8_t **) &dest_ep_name, &dest_ep_namelen); - opal_output_verbose(10, orte_rml_base_framework.framework_output, "Returned from MODEX_RECV"); - opal_output_verbose(50, orte_rml_base_framework.framework_output, - "%s Return value from OPAL_MODEX_RECV_STRING - %d, length returned - %lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ret, dest_ep_namelen); - free(pmix_key); - } else { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s calling OPAL_MODEX_RECV_STRING for DAEMON peer %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer)); - if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_NAME)) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml_ofi_send_to_self at tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tag); - /* send to self is a tad tricky - we really don't want - * to track the send callback function throughout the recv - * process and execute it upon receipt as this would provide - * very different timing from a non-self message. Specifically, - * if we just retain a pointer to the incoming data - * and then execute the send callback prior to the receive, - * then the caller will think we are done with the data and - * can release it. So we have to copy the data in order to - * execute the send callback prior to receiving the message. - * - * In truth, this really is a better mimic of the non-self - * message behavior. If we actually pushed the message out - * on the wire and had it loop back, then we would receive - * a new block of data anyway. 
- */ - /* setup the send callback */ - xfer = OBJ_NEW(orte_self_send_xfer_t); - if (NULL != req->send.iov) { - xfer->iov = req->send.iov; - xfer->count = req->send.count; - xfer->cbfunc.iov = req->send.cbfunc.iov; - } else { - xfer->buffer = req->send.buffer; - xfer->cbfunc.buffer = req->send.cbfunc.buffer; - } - xfer->tag = tag; - xfer->cbdata = req->send.cbdata; - /* setup the event for the send callback */ - opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer); - opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI); - opal_event_active(&xfer->ev, OPAL_EV_WRITE, 1); - - /* copy the message for the recv */ - rcv = OBJ_NEW(orte_rml_recv_t); - rcv->sender = *peer; - rcv->tag = tag; - if (NULL != req->send.iov) { - /* get the total number of bytes in the iovec array */ - bytes = 0; - for (i = 0 ; i < req->send.count ; ++i) { - bytes += req->send.iov[i].iov_len; - } - /* get the required memory allocation */ - if (0 < bytes) { - rcv->iov.iov_base = (IOVBASE_TYPE*)malloc(bytes); - rcv->iov.iov_len = bytes; - /* transfer the bytes */ - ptr = (char*)rcv->iov.iov_base; - for (i = 0 ; i < req->send.count ; ++i) { - memcpy(ptr, req->send.iov[i].iov_base, req->send.iov[i].iov_len); - ptr += req->send.iov[i].iov_len; - } - } - } else if (0 < req->send.buffer->bytes_used) { - rcv->iov.iov_base = (IOVBASE_TYPE*)malloc(req->send.buffer->bytes_used); - memcpy(rcv->iov.iov_base, req->send.buffer->base_ptr, req->send.buffer->bytes_used); - rcv->iov.iov_len = req->send.buffer->bytes_used; - } - /* post the message for receipt - since the send callback was posted - * first and has the same priority, it will execute first - */ - ORTE_RML_ACTIVATE_MESSAGE(rcv); - OBJ_RELEASE(req); - return; - } else { - memcpy(&ui64, (char*)peer, sizeof(uint64_t)); - if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_rml_ofi.peers, - ui64, (void**)&pr) || NULL == pr) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi: Send failed to get 
peer OFI contact info ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s rml:ofi: OFI peer contact info got from hash table", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - dest_ep_name = pr->ofi_ep; - dest_ep_namelen = pr->ofi_ep_len; - ret = OPAL_SUCCESS; - } - } - if ( OPAL_SUCCESS == ret) { - //Anandhi added for debug purpose - switch ( orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->addr_format) - { - case FI_SOCKADDR_IN : - /* Address is of type sockaddr_in (IPv4) */ - /*[debug] - print the sockaddr - port and s_addr */ - ep_sockaddr = (struct sockaddr_in*)dest_ep_name; - opal_output_verbose(1,orte_rml_base_framework.framework_output, - "%s peer %s epnamelen is %lu, port = %d (or) 0x%x, InternetAddr = 0x%s ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ORTE_NAME_PRINT(peer), - orte_rml_ofi.ofi_prov[ofi_prov_id].epnamelen,ntohs(ep_sockaddr->sin_port), - ntohs(ep_sockaddr->sin_port),inet_ntoa(ep_sockaddr->sin_addr)); - /*[end debug]*/ - break; - } - //Anandhi end debug - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s OPAL_MODEX_RECV succeded, %s peer ep name obtained. 
length=%lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer), dest_ep_namelen); - ret = fi_av_insert(orte_rml_ofi.ofi_prov[ofi_prov_id].av, dest_ep_name,1,&dest_fi_addr,0,NULL); - if( ret != 1) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s fi_av_insert failed in send_msg() returned %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ret ); - /* call the send-callback fn with error and return, also return failure status */ - snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN; - - ORTE_RML_SEND_COMPLETE(snd); - return; - } - } else { - - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s OPAL_MODEX_RECV failed to obtain %s peer ep name ", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer)); - /* call the send-callback fn with error and return, also return failure status */ - snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN; - ORTE_RML_SEND_COMPLETE(snd); - //OBJ_RELEASE( ofi_send_req); - return; - } - - ofi_send_req->send = snd; - ofi_send_req->completion_count = 1; - - /* [DESC] we want to send the pid,seqnum,tag in addition to the data - * copy all of this to header of message from the ofi_send_t* send - */ - ofi_send_req->hdr.dst = ofi_send_req->send->dst; - ofi_send_req->hdr.origin = ofi_send_req->send->origin; - ofi_send_req->hdr.seq_num = ofi_send_req->send->seq_num; - ofi_send_req->hdr.tag = ofi_send_req->send->tag; - - /* - * also insert ofi plugin specific header details - - * the unique msgid, for now initalise total_packets to 1 - */ - ofi_send_req->hdr.msgid = orte_rml_ofi.cur_msgid; - orte_rml_ofi.cur_msgid += 1; - total_packets = 1; - - /* copy the buffer/iov/data to the ofi_send_req->datablob and update ofi_send_req->length*/ - ofi_send_req->length = 0; - if( NULL != ofi_send_req->send->buffer) { - ofi_send_req->length = ofi_send_req->send->buffer->bytes_used; - ofi_send_req->data_blob = (char *)malloc(ofi_send_req->length); - memcpy(ofi_send_req->data_blob , - ofi_send_req->send->buffer->base_ptr, - 
ofi_send_req->send->buffer->bytes_used); - } else if ( NULL != ofi_send_req->send->iov) { - for (int i=0; i < ofi_send_req->send->count; i++) { - ofi_send_req->length += ofi_send_req->send->iov[i].iov_len; - } - ofi_send_req->data_blob = (char *)malloc(ofi_send_req->length); - int iovlen=0; - for (int i=0; i < ofi_send_req->send->count; i++) { - memcpy(((char *)ofi_send_req->data_blob + iovlen ), - ofi_send_req->send->iov[i].iov_base, - ofi_send_req->send->iov[i].iov_len); - iovlen += ofi_send_req->send->iov[i].iov_len; - } - } else { - //just send the data - ofi_send_req->length = ofi_send_req->send->count; - ofi_send_req->data_blob = (char *)malloc(ofi_send_req->length); - memcpy(ofi_send_req->data_blob , - ofi_send_req->send->data, - ofi_send_req->send->count); - } - - - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Completed copying all data into ofi_send_req->data_blob, total data - %lu bytes", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_send_req->length ); - - /* Each packet will have header information, so the data length in each packet is datalen_per_packet. 
- * check if the ofi_send_req->send->buffer->bytes_used is greater than the data per packet datalen_per_packet(recv buffer) - * if so fragment and add info to header and send it in a loop back-to-back */ - hdrsize = sizeof(orte_rml_ofi_msg_header_t); - datalen_per_pkt = MIN_MULTI_BUF_SIZE - hdrsize; - if (ofi_send_req->length > datalen_per_pkt ) - { - total_packets = ( ofi_send_req->length / datalen_per_pkt ) + 1 ; - } - ofi_send_req->hdr.tot_pkts = total_packets; - - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s datalen_per_pkt = %lu, ofi_send_req->length= %lu, total packets = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), datalen_per_pkt, ofi_send_req->length, total_packets ); - - /* in a loop send create and send the packets */ - for(size_t pkt_num=1,sent_data=0; sent_data < ofi_send_req->length; pkt_num++) { - ofi_send_req->hdr.cur_pkt_num = pkt_num; - /* create the packet */ - ofi_msg_pkt = OBJ_NEW(orte_rml_ofi_send_pkt_t); - data_in_pkt = ((ofi_send_req->length - sent_data) >= datalen_per_pkt) ? 
- datalen_per_pkt : (ofi_send_req->length - sent_data); - ofi_msg_pkt->pkt_size = hdrsize + data_in_pkt; - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Packet %lu -> data_in_pkt= %lu, header_size= %lu, pkt_size=%lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), pkt_num,data_in_pkt,hdrsize,ofi_msg_pkt->pkt_size ); - /* copy the header and data for this pkt */ - ofi_msg_pkt->data = malloc( ofi_msg_pkt->pkt_size); - memcpy(ofi_msg_pkt->data, &ofi_send_req->hdr, hdrsize ); - memcpy( ( (char *)ofi_msg_pkt->data + hdrsize ), - ((char*)ofi_send_req->data_blob + sent_data), - data_in_pkt); - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s Copying header, data into packets completed", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ); - /* add it to list */ - opal_list_append(&(ofi_send_req->pkt_list), &ofi_msg_pkt->super); - opal_output_verbose(15, orte_rml_base_framework.framework_output, - "%s adding packet %lu to list done successful", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),pkt_num ); - sent_data += data_in_pkt; - } - - if( ofi_send_req->hdr.tot_pkts != ofi_send_req->hdr.cur_pkt_num ) { - opal_output_verbose(1, orte_rml_base_framework.framework_output, - "%s Error: Total packets calculated [%d] does not match total created-%d pkts to peer %s with tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ofi_send_req->hdr.tot_pkts, ofi_send_req->hdr.cur_pkt_num, - ORTE_NAME_PRINT(peer), tag); - } - /* do the fi_send() for all the pkts */ - ofi_send_req->completion_count= ofi_send_req->hdr.tot_pkts; - OPAL_LIST_FOREACH(ofi_msg_pkt, &ofi_send_req->pkt_list, orte_rml_ofi_send_pkt_t) { - /* debug purpose - copying the header from packet to verify if it is correct */ - struct orte_rml_ofi_msg_header_t *cur_hdr; - cur_hdr = (struct orte_rml_ofi_msg_header_t* ) ofi_msg_pkt->data; - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s Sending Pkt[%d] of total %d pkts for msgid:%d to peer %s with tag %d", - 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cur_hdr->cur_pkt_num, ofi_send_req->completion_count, - cur_hdr->msgid, ORTE_NAME_PRINT(peer), tag); - /* end debug*/ - - RML_OFI_RETRY_UNTIL_DONE(fi_send(orte_rml_ofi.ofi_prov[ofi_prov_id].ep, - ofi_msg_pkt->data, - ofi_msg_pkt->pkt_size, - fi_mr_desc(orte_rml_ofi.ofi_prov[ofi_prov_id].mr_multi_recv), - dest_fi_addr, - (void *)&ofi_send_req->ctx)); - - } - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s End of send_msg_transport. fi_send completed to peer %s with tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer), tag); - OBJ_RELEASE(req); -} - -int orte_rml_ofi_send_nb(struct orte_rml_base_module_t* mod, - orte_process_name_t* peer, - struct iovec* iov, - int count, - orte_rml_tag_t tag, - orte_rml_callback_fn_t cbfunc, - void* cbdata) -{ - ofi_send_request_t *req; - orte_rml_ofi_module_t *ofi_mod = (orte_rml_ofi_module_t*)mod; - int ofi_prov_id = ofi_mod->cur_transport_id; - - - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s rml_ofi_send_transport to peer %s at tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer), tag); - - - if( (0 > ofi_prov_id) || ( ofi_prov_id >= orte_rml_ofi.ofi_prov_open_num ) ) { - /* Invalid ofi_prov ID provided */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - if (ORTE_RML_TAG_INVALID == tag) { - /* cannot send to an invalid tag */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - if (NULL == peer || - OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer)) { - /* cannot send to an invalid peer */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - /* get ourselves into an event to protect against - * race conditions and threads - */ - req = OBJ_NEW(ofi_send_request_t); - req->ofi_prov_id = ofi_prov_id; - req->send.dst = *peer; - req->send.iov = iov; - req->send.count = count; - req->send.tag = tag; - 
req->send.cbfunc.iov = cbfunc; - req->send.cbdata = cbdata; - - /* setup the event for the send callback */ - opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req); - opal_event_set_priority(&req->ev, ORTE_MSG_PRI); - opal_event_active(&req->ev, OPAL_EV_WRITE, 1); - - return ORTE_SUCCESS; -} - - -int orte_rml_ofi_send_buffer_nb(struct orte_rml_base_module_t *mod, - orte_process_name_t* peer, - struct opal_buffer_t* buffer, - orte_rml_tag_t tag, - orte_rml_buffer_callback_fn_t cbfunc, - void* cbdata) -{ - ofi_send_request_t *req; - orte_rml_ofi_module_t *ofi_mod = (orte_rml_ofi_module_t*)mod; - int ofi_prov_id = ofi_mod->cur_transport_id; - - opal_output_verbose(10, orte_rml_base_framework.framework_output, - "%s rml_ofi_send_buffer_transport to peer %s at tag %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(peer), tag); - - - if( (0 > ofi_prov_id) || ( ofi_prov_id >= orte_rml_ofi.ofi_prov_open_num ) ) { - /* Invalid ofi_prov ID provided */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - if (ORTE_RML_TAG_INVALID == tag) { - /* cannot send to an invalid tag */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - if (NULL == peer || - OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer)) { - /* cannot send to an invalid peer */ - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - /* get ourselves into an event to protect against - * race conditions and threads - */ - req = OBJ_NEW(ofi_send_request_t); - req->ofi_prov_id = ofi_prov_id; - req->send.dst = *peer; - req->send.buffer = buffer; - req->send.tag = tag; - req->send.cbfunc.buffer = cbfunc; - req->send.cbdata = cbdata; - - /* setup the event for the send callback */ - opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req); - opal_event_set_priority(&req->ev, ORTE_MSG_PRI); - opal_event_active(&req->ev, OPAL_EV_WRITE, 1); - - return ORTE_SUCCESS; -} diff --git 
a/orte/mca/rml/oob/rml_oob_send.c b/orte/mca/rml/oob/rml_oob_send.c index be113594864..7b56c60bdae 100644 --- a/orte/mca/rml/oob/rml_oob_send.c +++ b/orte/mca/rml/oob/rml_oob_send.c @@ -99,8 +99,6 @@ int orte_rml_oob_send_nb(struct orte_rml_base_module_t *mod, return ORTE_ERR_BAD_PARAM; } - OPAL_TIMING_EVENT((&tm_rml, "to %s", ORTE_NAME_PRINT(peer))); - /* if this is a message to myself, then just post the message * for receipt - no need to dive into the oob */ @@ -207,8 +205,6 @@ int orte_rml_oob_send_buffer_nb(struct orte_rml_base_module_t *mod, return ORTE_ERR_BAD_PARAM; } - OPAL_TIMING_EVENT((&tm_rml, "to %s", ORTE_NAME_PRINT(peer))); - /* if this is a message to myself, then just post the message * for receipt - no need to dive into the oob */ diff --git a/orte/mca/routed/direct/routed_direct.c b/orte/mca/routed/direct/routed_direct.c index 9024f62f078..ddcad934b69 100644 --- a/orte/mca/routed/direct/routed_direct.c +++ b/orte/mca/routed/direct/routed_direct.c @@ -4,7 +4,7 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -76,14 +76,16 @@ static int init(void) lifeline = NULL; if (ORTE_PROC_IS_DAEMON) { + ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid; /* if we are using static ports, set my lifeline to point at my parent */ if (orte_static_ports) { + /* we will have been given our parent's vpid by MCA param */ lifeline = ORTE_PROC_MY_PARENT; } else { /* set our lifeline to the HNP - we will abort if that connection is lost */ lifeline = ORTE_PROC_MY_HNP; + ORTE_PROC_MY_PARENT->vpid = 0; } - ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid; } else if (ORTE_PROC_IS_APP) { /* if we don't have a designated daemon, just * disqualify ourselves */ @@ -359,4 +361,3 @@ static int direct_ft_event(int state) return exit_status; } #endif - diff --git a/orte/mca/schizo/base/base.h b/orte/mca/schizo/base/base.h index ad5d9ffc63d..8f5ab569ee1 100644 --- a/orte/mca/schizo/base/base.h +++ b/orte/mca/schizo/base/base.h @@ -76,7 +76,7 @@ ORTE_DECLSPEC int orte_schizo_base_setup_child(orte_job_t *jobdat, orte_app_context_t *app, char ***env); ORTE_DECLSPEC orte_schizo_launch_environ_t orte_schizo_base_check_launch_environment(void); -ORTE_DECLSPEC long orte_schizo_base_get_remaining_time(void); +ORTE_DECLSPEC int orte_schizo_base_get_remaining_time(uint32_t *timeleft); ORTE_DECLSPEC void orte_schizo_base_finalize(void); END_C_DECLS diff --git a/orte/mca/schizo/base/schizo_base_stubs.c b/orte/mca/schizo/base/schizo_base_stubs.c index 173ca1c2bf3..8b7068434e3 100644 --- a/orte/mca/schizo/base/schizo_base_stubs.c +++ b/orte/mca/schizo/base/schizo_base_stubs.c @@ -162,20 +162,20 @@ orte_schizo_launch_environ_t orte_schizo_base_check_launch_environment(void) return ORTE_SCHIZO_UNDETERMINED; } -long orte_schizo_base_get_remaining_time(void) +int orte_schizo_base_get_remaining_time(uint32_t *timeleft) { - long rc; + int rc; orte_schizo_base_active_module_t *mod; OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, 
orte_schizo_base_active_module_t) { if (NULL != mod->module->get_remaining_time) { - rc = mod->module->get_remaining_time(); + rc = mod->module->get_remaining_time(timeleft); if (ORTE_ERR_TAKE_NEXT_OPTION != rc) { return rc; } } } - return -1; + return ORTE_ERR_NOT_SUPPORTED; } void orte_schizo_base_finalize(void) diff --git a/orte/mca/schizo/ompi/schizo_ompi.c b/orte/mca/schizo/ompi/schizo_ompi.c index 1bd42f4e435..b0e77f37cb1 100644 --- a/orte/mca/schizo/ompi/schizo_ompi.c +++ b/orte/mca/schizo/ompi/schizo_ompi.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. + * Copyright (c) 2006-2017 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2017 Oak Ridge National Labs. All rights reserved. @@ -75,96 +75,109 @@ orte_schizo_base_module_t orte_schizo_ompi_module = { static opal_cmd_line_init_t cmd_line_init[] = { /* Various "obvious" options */ - { NULL, 'h', NULL, "help", 0, - &orte_cmd_options.help, OPAL_CMD_LINE_TYPE_BOOL, - "This help message" }, + { NULL, 'h', NULL, "help", 1, + &orte_cmd_options.help, OPAL_CMD_LINE_TYPE_STRING, + "This help message", OPAL_CMD_LINE_OTYPE_GENERAL }, { NULL, 'V', NULL, "version", 0, &orte_cmd_options.version, OPAL_CMD_LINE_TYPE_BOOL, - "Print version and exit" }, + "Print version and exit", OPAL_CMD_LINE_OTYPE_GENERAL }, { NULL, 'v', NULL, "verbose", 0, &orte_cmd_options.verbose, OPAL_CMD_LINE_TYPE_BOOL, - "Be verbose" }, + "Be verbose", OPAL_CMD_LINE_OTYPE_GENERAL }, { "orte_execute_quiet", 'q', NULL, "quiet", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Suppress helpful messages" }, + "Suppress helpful messages", OPAL_CMD_LINE_OTYPE_GENERAL }, { NULL, '\0', "report-pid", "report-pid", 1, &orte_cmd_options.report_pid, OPAL_CMD_LINE_TYPE_STRING, - "Printout pid on stdout [-], 
stderr [+], or a file [anything else]" }, + "Printout pid on stdout [-], stderr [+], or a file [anything else]", + OPAL_CMD_LINE_OTYPE_DEBUG }, { NULL, '\0', "report-uri", "report-uri", 1, &orte_cmd_options.report_uri, OPAL_CMD_LINE_TYPE_STRING, - "Printout URI on stdout [-], stderr [+], or a file [anything else]" }, + "Printout URI on stdout [-], stderr [+], or a file [anything else]", + OPAL_CMD_LINE_OTYPE_DEBUG }, /* testing options */ { NULL, '\0', "timeout", "timeout", 1, &orte_cmd_options.timeout, OPAL_CMD_LINE_TYPE_INT, - "Timeout the job after the specified number of seconds" }, + "Timeout the job after the specified number of seconds", + OPAL_CMD_LINE_OTYPE_DEBUG }, { NULL, '\0', "report-state-on-timeout", "report-state-on-timeout", 0, &orte_cmd_options.report_state_on_timeout, OPAL_CMD_LINE_TYPE_BOOL, - "Report all job and process states upon timeout" }, + "Report all job and process states upon timeout", + OPAL_CMD_LINE_OTYPE_DEBUG }, { NULL, '\0', "get-stack-traces", "get-stack-traces", 0, &orte_cmd_options.get_stack_traces, OPAL_CMD_LINE_TYPE_BOOL, - "Get stack traces of all application procs on timeout" }, + "Get stack traces of all application procs on timeout", + OPAL_CMD_LINE_OTYPE_DEBUG }, /* exit status reporting */ { "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Return the exit status of the primary job only" }, + "Return the exit status of the primary job only", OPAL_CMD_LINE_OTYPE_OUTPUT }, /* uri of the dvm, or at least where to get it */ { NULL, '\0', "hnp", "hnp", 1, &orte_cmd_options.hnp, OPAL_CMD_LINE_TYPE_STRING, - "Specify the URI of the HNP, or the name of the file (specified as file:filename) that contains that info" }, + "Specify the URI of the HNP, or the name of the file (specified as file:filename) that contains that info", + OPAL_CMD_LINE_OTYPE_DVM }, /* select XML output */ { "orte_xml_output", '\0', "xml", "xml", 0, NULL, 
OPAL_CMD_LINE_TYPE_BOOL, - "Provide all output in XML format" }, + "Provide all output in XML format", OPAL_CMD_LINE_OTYPE_OUTPUT }, { "orte_xml_file", '\0', "xml-file", "xml-file", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide all output in XML format to the specified file" }, + "Provide all output in XML format to the specified file", OPAL_CMD_LINE_OTYPE_OUTPUT }, /* tag output */ { "orte_tag_output", '\0', "tag-output", "tag-output", 0, &orte_cmd_options.tag_output, OPAL_CMD_LINE_TYPE_BOOL, - "Tag all output with [job,rank]" }, + "Tag all output with [job,rank]", OPAL_CMD_LINE_OTYPE_OUTPUT }, { "orte_timestamp_output", '\0', "timestamp-output", "timestamp-output", 0, &orte_cmd_options.timestamp_output, OPAL_CMD_LINE_TYPE_BOOL, - "Timestamp all application process output" }, + "Timestamp all application process output", OPAL_CMD_LINE_OTYPE_OUTPUT }, { "orte_output_filename", '\0', "output-filename", "output-filename", 1, &orte_cmd_options.output_filename, OPAL_CMD_LINE_TYPE_STRING, - "Redirect output from application processes into filename/job/rank/std[out,err,diag]" }, + "Redirect output from application processes into filename/job/rank/std[out,err,diag]", + OPAL_CMD_LINE_OTYPE_OUTPUT }, { NULL, '\0', "merge-stderr-to-stdout", "merge-stderr-to-stdout", 0, &orte_cmd_options.merge, OPAL_CMD_LINE_TYPE_BOOL, - "Merge stderr to stdout for each process"}, + "Merge stderr to stdout for each process", OPAL_CMD_LINE_OTYPE_OUTPUT }, { "orte_xterm", '\0', "xterm", "xterm", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Create a new xterm window and display output from the specified ranks there" }, + "Create a new xterm window and display output from the specified ranks there", + OPAL_CMD_LINE_OTYPE_OUTPUT }, /* select stdin option */ { NULL, '\0', "stdin", "stdin", 1, &orte_cmd_options.stdin_target, OPAL_CMD_LINE_TYPE_STRING, - "Specify procs to receive stdin [rank, all, none] (default: 0, indicating rank 0)" }, + "Specify procs to receive stdin [rank, all, none] (default: 0, 
indicating rank 0)", + OPAL_CMD_LINE_OTYPE_INPUT }, /* request that argv[0] be indexed */ { NULL, '\0', "index-argv-by-rank", "index-argv-by-rank", 0, &orte_cmd_options.index_argv, OPAL_CMD_LINE_TYPE_BOOL, - "Uniquely index argv[0] for each process using its rank" }, + "Uniquely index argv[0] for each process using its rank", + OPAL_CMD_LINE_OTYPE_INPUT }, /* Specify the launch agent to be used */ { "orte_launch_agent", '\0', "launch-agent", "launch-agent", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Command used to start processes on remote nodes (default: orted)" }, + "Command used to start processes on remote nodes (default: orted)", + OPAL_CMD_LINE_OTYPE_LAUNCH }, /* Preload the binary on the remote machine */ { NULL, 's', NULL, "preload-binary", 0, &orte_cmd_options.preload_binaries, OPAL_CMD_LINE_TYPE_BOOL, - "Preload the binary on the remote machine before starting the remote process." }, + "Preload the binary on the remote machine before starting the remote process.", + OPAL_CMD_LINE_OTYPE_LAUNCH }, /* Preload files on the remote machine */ { NULL, '\0', NULL, "preload-files", 1, &orte_cmd_options.preload_files, OPAL_CMD_LINE_TYPE_STRING, - "Preload the comma separated list of files to the remote machines current working directory before starting the remote process." 
}, + "Preload the comma separated list of files to the remote machines current working directory before starting the remote process.", + OPAL_CMD_LINE_OTYPE_LAUNCH }, #if OPAL_ENABLE_FT_CR == 1 /* Tell SStore to preload a snapshot before launch */ @@ -176,248 +189,287 @@ static opal_cmd_line_init_t cmd_line_init[] = { /* Use an appfile */ { NULL, '\0', NULL, "app", 1, &orte_cmd_options.appfile, OPAL_CMD_LINE_TYPE_STRING, - "Provide an appfile; ignore all other command line options" }, + "Provide an appfile; ignore all other command line options", + OPAL_CMD_LINE_OTYPE_LAUNCH }, /* Number of processes; -c, -n, --n, -np, and --np are all synonyms */ { NULL, 'c', "np", "np", 1, &orte_cmd_options.num_procs, OPAL_CMD_LINE_TYPE_INT, - "Number of processes to run" }, + "Number of processes to run", OPAL_CMD_LINE_OTYPE_GENERAL }, { NULL, '\0', "n", "n", 1, &orte_cmd_options.num_procs, OPAL_CMD_LINE_TYPE_INT, - "Number of processes to run" }, + "Number of processes to run", OPAL_CMD_LINE_OTYPE_GENERAL }, /* maximum size of VM - typically used to subdivide an allocation */ { "orte_max_vm_size", '\0', "max-vm-size", "max-vm-size", 1, NULL, OPAL_CMD_LINE_TYPE_INT, - "Number of processes to run" }, + "Number of processes to run", OPAL_CMD_LINE_OTYPE_DVM }, /* Set a hostfile */ { NULL, '\0', "hostfile", "hostfile", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a hostfile" }, + "Provide a hostfile", OPAL_CMD_LINE_OTYPE_LAUNCH }, { NULL, '\0', "machinefile", "machinefile", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a hostfile" }, + "Provide a hostfile", OPAL_CMD_LINE_OTYPE_LAUNCH }, { "orte_default_hostfile", '\0', "default-hostfile", "default-hostfile", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a default hostfile" }, + "Provide a default hostfile", OPAL_CMD_LINE_OTYPE_LAUNCH }, { "opal_if_do_not_resolve", '\0', "do-not-resolve", "do-not-resolve", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Do not attempt to resolve interfaces" }, + "Do not attempt to resolve interfaces", 
OPAL_CMD_LINE_OTYPE_DEVEL }, /* uri of PMIx publish/lookup server, or at least where to get it */ { "pmix_server_uri", '\0', "ompi-server", "ompi-server", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Specify the URI of the publish/lookup server, or the name of the file (specified as file:filename) that contains that info" }, + "Specify the URI of the publish/lookup server, or the name of the file (specified as file:filename) that contains that info", + OPAL_CMD_LINE_OTYPE_DVM }, { "carto_file_path", '\0', "cf", "cartofile", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a cartography file" }, + "Provide a cartography file", OPAL_CMD_LINE_OTYPE_MAPPING }, { "orte_rankfile", '\0', "rf", "rankfile", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a rankfile file" }, + "Provide a rankfile file", OPAL_CMD_LINE_OTYPE_MAPPING }, /* Export environment variables; potentially used multiple times, so it does not make sense to set into a variable */ { NULL, 'x', NULL, NULL, 1, NULL, OPAL_CMD_LINE_TYPE_NULL, - "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" }, + "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)", OPAL_CMD_LINE_OTYPE_LAUNCH }, /* Mapping controls */ { "rmaps_base_display_map", '\0', "display-map", "display-map", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display the process map just before launch"}, + "Display the process map just before launch", OPAL_CMD_LINE_OTYPE_DEBUG }, { "rmaps_base_display_devel_map", '\0', "display-devel-map", "display-devel-map", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display a detailed process 
map (mostly intended for developers) just before launch"}, + "Display a detailed process map (mostly intended for developers) just before launch", + OPAL_CMD_LINE_OTYPE_DEVEL }, { "rmaps_base_display_topo_with_map", '\0', "display-topo", "display-topo", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display the topology as part of the process map (mostly intended for developers) just before launch"}, + "Display the topology as part of the process map (mostly intended for developers) just before launch", + OPAL_CMD_LINE_OTYPE_DEVEL }, { "rmaps_base_display_diffable_map", '\0', "display-diffable-map", "display-diffable-map", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display a diffable process map (mostly intended for developers) just before launch"}, + "Display a diffable process map (mostly intended for developers) just before launch", + OPAL_CMD_LINE_OTYPE_DEVEL }, { NULL, 'H', "host", "host", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "List of hosts to invoke processes on" }, + "List of hosts to invoke processes on", + OPAL_CMD_LINE_OTYPE_MAPPING }, { "rmaps_base_no_schedule_local", '\0', "nolocal", "nolocal", 0, &orte_cmd_options.nolocal, OPAL_CMD_LINE_TYPE_BOOL, - "Do not run any MPI applications on the local node" }, + "Do not run any MPI applications on the local node", + OPAL_CMD_LINE_OTYPE_MAPPING }, { "rmaps_base_no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0, &orte_cmd_options.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL, - "Nodes are not to be oversubscribed, even if the system supports such operation"}, + "Nodes are not to be oversubscribed, even if the system supports such operation", + OPAL_CMD_LINE_OTYPE_MAPPING }, { "rmaps_base_oversubscribe", '\0', "oversubscribe", "oversubscribe", 0, &orte_cmd_options.oversubscribe, OPAL_CMD_LINE_TYPE_BOOL, - "Nodes are allowed to be oversubscribed, even on a managed system, and overloading of processing elements"}, + "Nodes are allowed to be oversubscribed, even on a managed system, and overloading of processing 
elements", + OPAL_CMD_LINE_OTYPE_MAPPING }, { "rmaps_base_cpus_per_rank", '\0', "cpus-per-proc", "cpus-per-proc", 1, &orte_cmd_options.cpus_per_proc, OPAL_CMD_LINE_TYPE_INT, - "Number of cpus to use for each process [default=1]" }, + "Number of cpus to use for each process [default=1]", + OPAL_CMD_LINE_OTYPE_MAPPING }, { "rmaps_base_cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1, &orte_cmd_options.cpus_per_proc, OPAL_CMD_LINE_TYPE_INT, - "Synonym for cpus-per-proc" }, + "Synonym for cpus-per-proc", OPAL_CMD_LINE_OTYPE_MAPPING }, /* backward compatiblity */ { "rmaps_base_bycore", '\0', "bycore", "bycore", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to map and rank processes round-robin by core" }, + "Whether to map and rank processes round-robin by core", + OPAL_CMD_LINE_OTYPE_COMPAT }, { "rmaps_base_bynode", '\0', "bynode", "bynode", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to map and rank processes round-robin by node" }, + "Whether to map and rank processes round-robin by node", + OPAL_CMD_LINE_OTYPE_COMPAT }, { "rmaps_base_byslot", '\0', "byslot", "byslot", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to map and rank processes round-robin by slot" }, + "Whether to map and rank processes round-robin by slot", + OPAL_CMD_LINE_OTYPE_COMPAT }, /* Nperxxx options that do not require topology and are always * available - included for backwards compatibility */ { "rmaps_ppr_pernode", '\0', "pernode", "pernode", 0, &orte_cmd_options.pernode, OPAL_CMD_LINE_TYPE_BOOL, - "Launch one process per available node" }, + "Launch one process per available node", + OPAL_CMD_LINE_OTYPE_COMPAT }, { "rmaps_ppr_n_pernode", '\0', "npernode", "npernode", 1, - &orte_cmd_options.npernode, OPAL_CMD_LINE_TYPE_INT, - "Launch n processes per node on all allocated nodes" }, + &orte_cmd_options.npernode, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per node on all allocated nodes", + OPAL_CMD_LINE_OTYPE_COMPAT }, { "rmaps_ppr_n_pernode", '\0', "N", NULL, 1, - 
&orte_cmd_options.npernode, OPAL_CMD_LINE_TYPE_INT, - "Launch n processes per node on all allocated nodes (synonym for npernode)" }, + &orte_cmd_options.npernode, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per node on all allocated nodes (synonym for 'map-by node')", + OPAL_CMD_LINE_OTYPE_MAPPING }, /* declare hardware threads as independent cpus */ { "hwloc_base_use_hwthreads_as_cpus", '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Use hardware threads as independent cpus" }, + "Use hardware threads as independent cpus", OPAL_CMD_LINE_OTYPE_MAPPING }, /* include npersocket for backwards compatibility */ { "rmaps_ppr_n_persocket", '\0', "npersocket", "npersocket", 1, &orte_cmd_options.npersocket, OPAL_CMD_LINE_TYPE_INT, - "Launch n processes per socket on all allocated nodes" }, + "Launch n processes per socket on all allocated nodes", + OPAL_CMD_LINE_OTYPE_COMPAT }, /* Mapping options */ { "rmaps_base_mapping_policy", '\0', NULL, "map-by", 1, &orte_cmd_options.mapping_policy, OPAL_CMD_LINE_TYPE_STRING, - "Mapping Policy [slot | hwthread | core | socket (default) | numa | board | node]" }, + "Mapping Policy [slot | hwthread | core | socket (default) | numa | board | node]", + OPAL_CMD_LINE_OTYPE_MAPPING }, /* Ranking options */ { "rmaps_base_ranking_policy", '\0', NULL, "rank-by", 1, &orte_cmd_options.ranking_policy, OPAL_CMD_LINE_TYPE_STRING, - "Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]" }, + "Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]", + OPAL_CMD_LINE_OTYPE_RANKING }, /* Binding options */ { "hwloc_base_binding_policy", '\0', NULL, "bind-to", 1, &orte_cmd_options.binding_policy, OPAL_CMD_LINE_TYPE_STRING, - "Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board (\"none\" is the default when oversubscribed, \"core\" is the default when np<=2, and \"socket\" is the default when np>2). 
Allowed qualifiers: overload-allowed, if-supported" }, + "Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board (\"none\" is the default when oversubscribed, \"core\" is the default when np<=2, and \"socket\" is the default when np>2). Allowed qualifiers: overload-allowed, if-supported", OPAL_CMD_LINE_OTYPE_BINDING }, /* backward compatiblity */ { "hwloc_base_bind_to_core", '\0', "bind-to-core", "bind-to-core", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Bind processes to cores" }, + "Bind processes to cores", OPAL_CMD_LINE_OTYPE_COMPAT }, { "hwloc_base_bind_to_socket", '\0', "bind-to-socket", "bind-to-socket", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Bind processes to sockets" }, + "Bind processes to sockets", OPAL_CMD_LINE_OTYPE_COMPAT }, { "hwloc_base_report_bindings", '\0', "report-bindings", "report-bindings", 0, &orte_cmd_options.report_bindings, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to report process bindings to stderr" }, + "Whether to report process bindings to stderr", + OPAL_CMD_LINE_OTYPE_BINDING }, /* slot list option */ { "hwloc_base_cpu_list", '\0', "cpu-list", "cpu-list", 1, &orte_cmd_options.cpu_list, OPAL_CMD_LINE_TYPE_STRING, - "List of processor IDs to bind processes to [default=NULL]"}, + "List of processor IDs to bind processes to [default=NULL]", + OPAL_CMD_LINE_OTYPE_BINDING }, /* generalized pattern mapping option */ { "rmaps_ppr_pattern", '\0', NULL, "ppr", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Comma-separated list of number of processes on a given resource type [default: none]" }, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Comma-separated list of number of processes on a given resource type [default: none]", + OPAL_CMD_LINE_OTYPE_MAPPING }, /* Allocation options */ { "orte_display_alloc", '\0', "display-allocation", "display-allocation", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display the allocation being used by this job"}, + "Display the allocation being used by this job", 
OPAL_CMD_LINE_OTYPE_DEBUG }, { "orte_display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display a detailed list (mostly intended for developers) of the allocation being used by this job"}, + "Display a detailed list (mostly intended for developers) of the allocation being used by this job", + OPAL_CMD_LINE_OTYPE_DEVEL }, { "hwloc_base_cpu_set", '\0', "cpu-set", "cpu-set", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"}, + "Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]", + OPAL_CMD_LINE_OTYPE_DEBUG }, /* mpiexec-like arguments */ { NULL, '\0', "wdir", "wdir", 1, &orte_cmd_options.wdir, OPAL_CMD_LINE_TYPE_STRING, - "Set the working directory of the started processes" }, + "Set the working directory of the started processes", + OPAL_CMD_LINE_OTYPE_LAUNCH }, { NULL, '\0', "wd", "wd", 1, &orte_cmd_options.wdir, OPAL_CMD_LINE_TYPE_STRING, - "Synonym for --wdir" }, + "Synonym for --wdir", OPAL_CMD_LINE_OTYPE_LAUNCH }, { NULL, '\0', "set-cwd-to-session-dir", "set-cwd-to-session-dir", 0, &orte_cmd_options.set_cwd_to_session_dir, OPAL_CMD_LINE_TYPE_BOOL, - "Set the working directory of the started processes to their session directory" }, + "Set the working directory of the started processes to their session directory", + OPAL_CMD_LINE_OTYPE_LAUNCH }, { NULL, '\0', "path", "path", 1, &orte_cmd_options.path, OPAL_CMD_LINE_TYPE_STRING, - "PATH to be used to look for executables to start processes" }, + "PATH to be used to look for executables to start processes", + OPAL_CMD_LINE_OTYPE_LAUNCH }, /* User-level debugger arguments */ { NULL, '\0', "tv", "tv", 0, &orte_cmd_options.debugger, OPAL_CMD_LINE_TYPE_BOOL, - "Deprecated backwards compatibility flag; synonym for \"--debug\"" }, + "Deprecated backwards compatibility flag; synonym for \"--debug\"", + OPAL_CMD_LINE_OTYPE_DEBUG 
}, { NULL, '\0', "debug", "debug", 0, &orte_cmd_options.debugger, OPAL_CMD_LINE_TYPE_BOOL, - "Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter" }, + "Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter", + OPAL_CMD_LINE_OTYPE_DEBUG }, { "orte_base_user_debugger", '\0', "debugger", "debugger", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Sequence of debuggers to search for when \"--debug\" is used" }, + "Sequence of debuggers to search for when \"--debug\" is used", + OPAL_CMD_LINE_OTYPE_DEBUG }, { "orte_output_debugger_proctable", '\0', "output-proctable", "output-proctable", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Output the debugger proctable after launch" }, + "Output the debugger proctable after launch", + OPAL_CMD_LINE_OTYPE_DEBUG }, /* OpenRTE arguments */ { "orte_debug", 'd', "debug-devel", "debug-devel", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable debugging of OpenRTE" }, + "Enable debugging of OpenRTE", OPAL_CMD_LINE_OTYPE_DEVEL }, { "orte_debug_daemons", '\0', "debug-daemons", "debug-daemons", 0, NULL, OPAL_CMD_LINE_TYPE_INT, - "Enable debugging of any OpenRTE daemons used by this application" }, + "Enable debugging of any OpenRTE daemons used by this application", + OPAL_CMD_LINE_OTYPE_DEVEL }, { "orte_debug_daemons_file", '\0', "debug-daemons-file", "debug-daemons-file", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable debugging of any OpenRTE daemons used by this application, storing output in files" }, + "Enable debugging of any OpenRTE daemons used by this application, storing output in files", + OPAL_CMD_LINE_OTYPE_DEVEL }, { "orte_leave_session_attached", '\0', "leave-session-attached", "leave-session-attached", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable debugging of OpenRTE" }, + "Enable debugging of OpenRTE", OPAL_CMD_LINE_OTYPE_DEBUG }, { "orte_do_not_launch", '\0', "do-not-launch", "do-not-launch", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Perform all necessary operations to prepare to launch 
the application, but do not actually launch it" }, + "Perform all necessary operations to prepare to launch the application, but do not actually launch it", + OPAL_CMD_LINE_OTYPE_DEVEL }, { NULL, '\0', NULL, "prefix", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Prefix where Open MPI is installed on remote nodes" }, + "Prefix where Open MPI is installed on remote nodes", + OPAL_CMD_LINE_OTYPE_LAUNCH }, { NULL, '\0', NULL, "noprefix", 0, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Disable automatic --prefix behavior" }, + "Disable automatic --prefix behavior", + OPAL_CMD_LINE_OTYPE_LAUNCH }, { "orte_report_launch_progress", '\0', "show-progress", "show-progress", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Output a brief periodic report on launch progress" }, + "Output a brief periodic report on launch progress", + OPAL_CMD_LINE_OTYPE_LAUNCH }, { "orte_use_regexp", '\0', "use-regexp", "use-regexp", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Use regular expressions for launch" }, + "Use regular expressions for launch", OPAL_CMD_LINE_OTYPE_LAUNCH }, { "orte_report_events", '\0', "report-events", "report-events", 1, NULL, OPAL_CMD_LINE_TYPE_STRING, - "Report events to a tool listening at the specified URI" }, + "Report events to a tool listening at the specified URI", OPAL_CMD_LINE_OTYPE_DEBUG }, { "orte_enable_recovery", '\0', "enable-recovery", "enable-recovery", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable recovery from process failure [Default = disabled]" }, + "Enable recovery from process failure [Default = disabled]", + OPAL_CMD_LINE_OTYPE_UNSUPPORTED }, { "orte_max_restarts", '\0', "max-restarts", "max-restarts", 1, NULL, OPAL_CMD_LINE_TYPE_INT, - "Max number of times to restart a failed process" }, + "Max number of times to restart a failed process", + OPAL_CMD_LINE_OTYPE_UNSUPPORTED }, { NULL, '\0', "continuous", "continuous", 0, &orte_cmd_options.continuous, OPAL_CMD_LINE_TYPE_BOOL, - "Job is to run until explicitly terminated" }, + "Job is to run until explicitly terminated", 
OPAL_CMD_LINE_OTYPE_DEBUG }, #if OPAL_ENABLE_CRDEBUG == 1 { "opal_cr_enable_crdebug", '\0', "crdebug", "crdebug", 0, @@ -427,33 +479,39 @@ static opal_cmd_line_init_t cmd_line_init[] = { { NULL, '\0', "disable-recovery", "disable-recovery", 0, &orte_cmd_options.disable_recovery, OPAL_CMD_LINE_TYPE_BOOL, - "Disable recovery (resets all recovery options to off)" }, + "Disable recovery (resets all recovery options to off)", + OPAL_CMD_LINE_OTYPE_UNSUPPORTED }, { "orte_no_vm", '\0', "novm", "novm", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Execute without creating an allocation-spanning virtual machine (only start daemons on nodes hosting application procs)" }, + "Execute without creating an allocation-spanning virtual machine (only start daemons on nodes hosting application procs)", + OPAL_CMD_LINE_OTYPE_DVM }, { NULL, '\0', "allow-run-as-root", "allow-run-as-root", 0, &orte_cmd_options.run_as_root, OPAL_CMD_LINE_TYPE_BOOL, - "Allow execution as root (STRONGLY DISCOURAGED)" }, + "Allow execution as root (STRONGLY DISCOURAGED)", + OPAL_CMD_LINE_OTYPE_LAUNCH }, { NULL, '\0', "personality", "personality", 1, &orte_cmd_options.personality, OPAL_CMD_LINE_TYPE_STRING, - "Comma-separated list of programming model, languages, and containers being used (default=\"ompi\")" }, + "Comma-separated list of programming model, languages, and containers being used (default=\"ompi\")", + OPAL_CMD_LINE_OTYPE_LAUNCH }, { NULL, '\0', "dvm", "dvm", 0, &orte_cmd_options.create_dvm, OPAL_CMD_LINE_TYPE_BOOL, - "Create a persistent distributed virtual machine (DVM)" }, + "Create a persistent distributed virtual machine (DVM)", + OPAL_CMD_LINE_OTYPE_DVM }, /* tell the dvm to terminate */ { NULL, '\0', "terminate", "terminate", 0, &orte_cmd_options.terminate_dvm, OPAL_CMD_LINE_TYPE_BOOL, - "Terminate the DVM" }, + "Terminate the DVM", OPAL_CMD_LINE_OTYPE_DVM }, /* fwd mpirun port */ { "orte_fwd_mpirun_port", '\0', "fwd-mpirun-port", "fwd-mpirun-port", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Forward 
mpirun port to compute node daemons so all will use it" }, + "Forward mpirun port to compute node daemons so all will use it", + OPAL_CMD_LINE_OTYPE_LAUNCH }, /* End of list */ { NULL, '\0', NULL, NULL, 0, diff --git a/orte/mca/schizo/schizo.h b/orte/mca/schizo/schizo.h index 77b1782fc28..56f47e4e62d 100644 --- a/orte/mca/schizo/schizo.h +++ b/orte/mca/schizo/schizo.h @@ -118,7 +118,7 @@ typedef void (*orte_schizo_base_module_finalize_fn_t)(void); * and decides it cannot provide the info in the current situation, * then it can return ORTE_ERR_TAKE_NEXT_OPTION to indicate that * another module should be tried */ -typedef long (*orte_schizo_base_module_get_rem_time_fn_t)(void); +typedef int (*orte_schizo_base_module_get_rem_time_fn_t)(uint32_t *timeleft); /* * schizo module version 1.3.0 diff --git a/orte/mca/schizo/slurm/schizo_slurm.c b/orte/mca/schizo/slurm/schizo_slurm.c index fbfd4a1f799..3f5bebe6ce9 100644 --- a/orte/mca/schizo/slurm/schizo_slurm.c +++ b/orte/mca/schizo/slurm/schizo_slurm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2017 Intel, Inc. All rights reserved. * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. 
* $COPYRIGHT$ *
@@ -29,10 +29,12 @@
 #include "schizo_slurm.h"
 
 static orte_schizo_launch_environ_t check_launch_environment(void);
+static int get_remaining_time(uint32_t *timeleft);
 static void finalize(void);
 
 orte_schizo_base_module_t orte_schizo_slurm_module = {
     .check_launch_environment = check_launch_environment,
+    .get_remaining_time = get_remaining_time,
     .finalize = finalize
 };
 
@@ -123,6 +125,79 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
     return myenv;
 }
 
+/**
+ * Report the time remaining in the enclosing SLURM job, in seconds.
+ *
+ * Runs "squeue -h -j <SLURM_JOBID> -o %L" and converts its output.
+ *
+ * @param timeleft  (OUT) seconds remaining, or UINT32_MAX when no
+ *                  limit could be determined
+ * @return ORTE_SUCCESS if squeue was queried, ORTE_ERR_TAKE_NEXT_OPTION
+ *         when SLURM_JOBID is not set, or an ORTE error code when
+ *         squeue could not be run or produced no output
+ */
+static int get_remaining_time(uint32_t *timeleft)
+{
+    char output[256], *cmd, *jobid, *end;
+    FILE *fp;
+    uint64_t tleft;
+    long days = 0, val;
+    int nfields;
+
+    /* set the default - UINT32_MAX means no known limit */
+    *timeleft = UINT32_MAX;
+
+    if (NULL == (jobid = getenv("SLURM_JOBID"))) {
+        return ORTE_ERR_TAKE_NEXT_OPTION;
+    }
+    if (0 > asprintf(&cmd, "squeue -h -j %s -o %%L", jobid)) {
+        return ORTE_ERR_OUT_OF_RESOURCE;
+    }
+    fp = popen(cmd, "r");
+    /* the command string is not needed once the pipe is opened */
+    free(cmd);
+    if (NULL == fp) {
+        return ORTE_ERR_FILE_OPEN_FAILURE;
+    }
+    if (NULL == fgets(output, sizeof(output), fp)) {
+        pclose(fp);
+        return ORTE_ERR_FILE_READ_FAILURE;
+    }
+    pclose(fp);
+
+    /* squeue reports the time left as [days-][[hours:]minutes:]seconds,
+     * or as a keyword (e.g., UNLIMITED or NOT_SET) when there is no
+     * usable value. Anything that does not start with a digit is
+     * treated as "no limit", so the default set above is kept */
+    if (output[0] < '0' || '9' < output[0]) {
+        return ORTE_SUCCESS;
+    }
+    val = strtol(output, &end, 10);
+    /* a leading "N-" is the number of days */
+    if ('-' == *end) {
+        days = val;
+        val = strtol(end + 1, &end, 10);
+    }
+    /* fold the colon-delimited fields, most significant first: each
+     * additional field scales what we have so far by 60 */
+    tleft = (uint64_t)val;
+    nfields = 1;
+    while (':' == *end) {
+        val = strtol(end + 1, &end, 10);
+        tleft = 60 * tleft + (uint64_t)val;
+        ++nfields;
+    }
+    tleft += (uint64_t)days * 24 * 3600;
+    /* if there are more fields than hours:minutes:seconds, or the
+     * value does not fit, then it is infinite */
+    if (3 < nfields || UINT32_MAX < tleft) {
+        tleft = UINT32_MAX;
+    }
+
+    *timeleft = (uint32_t)tleft;
+    return ORTE_SUCCESS;
+}
+
 static void finalize(void)
 {
     int i;
diff --git a/orte/mca/schizo/slurm/schizo_slurm_component.c b/orte/mca/schizo/slurm/schizo_slurm_component.c index 32d4bfbead9..180bf9a3c56 100644 ---
a/orte/mca/schizo/slurm/schizo_slurm_component.c +++ b/orte/mca/schizo/slurm/schizo_slurm_component.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -38,8 +38,8 @@ orte_schizo_base_component_t mca_schizo_slurm_component = { static int component_query(mca_base_module_t **module, int *priority) { - /* disqualify ourselves if we are not an app or under slurm */ - if (!ORTE_PROC_IS_APP) { + /* disqualify ourselves if we are not under slurm */ + if (NULL == getenv("SLURM_JOBID")) { *priority = 0; *module = NULL; return OPAL_ERROR; @@ -49,4 +49,3 @@ static int component_query(mca_base_module_t **module, int *priority) *priority = 50; return ORTE_SUCCESS; } - diff --git a/orte/mca/sensor/base/base.h b/orte/mca/sensor/base/base.h deleted file mode 100644 index c01cf9ed4d6..00000000000 --- a/orte/mca/sensor/base/base.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved. - * - * Copyright (c) 2017 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file: - */ - -#ifndef MCA_SENSOR_BASE_H -#define MCA_SENSOR_BASE_H - -/* - * includes - */ -#include "orte_config.h" - -#include "opal/class/opal_list.h" -#include "opal/mca/base/base.h" - -#include "orte/mca/sensor/sensor.h" - -BEGIN_C_DECLS - -/* - * MCA Framework - */ -ORTE_DECLSPEC extern mca_base_framework_t orte_sensor_base_framework; -/* select a component */ -ORTE_DECLSPEC int orte_sensor_base_select(void); - - -END_C_DECLS -#endif diff --git a/orte/mca/sensor/base/sensor_base_fns.c b/orte/mca/sensor/base/sensor_base_fns.c deleted file mode 100644 index 81f9bbf69ae..00000000000 --- a/orte/mca/sensor/base/sensor_base_fns.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/dss/dss.h" -#include "opal/mca/event/event.h" - -#include "orte/mca/sensor/base/base.h" -#include "orte/mca/sensor/base/sensor_private.h" - -static bool mods_active = false; - -void orte_sensor_base_start(orte_jobid_t job) -{ - orte_sensor_active_module_t *i_module; - int i; - - if (0 < orte_sensor_base.rate.tv_sec) { - opal_output_verbose(5, orte_sensor_base_framework.framework_output, - "%s sensor:base: starting sensors", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* call the start function of all modules in priority order */ - for (i=0; i < orte_sensor_base.modules.size; i++) { - if (NULL == (i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i))) { - continue; - } - mods_active = true; - if (NULL != i_module->module->start) { - i_module->module->start(job); - } - } - - if (mods_active && !orte_sensor_base.active) { - 
/* setup a buffer to collect samples */ - orte_sensor_base.samples = OBJ_NEW(opal_buffer_t); - /* startup a timer to wake us up periodically - * for a data sample - */ - orte_sensor_base.active = true; - opal_event_evtimer_set(orte_event_base, &orte_sensor_base.sample_ev, - orte_sensor_base_sample, NULL); - opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate); - } - } - return; -} - -void orte_sensor_base_stop(orte_jobid_t job) -{ - orte_sensor_active_module_t *i_module; - int i; - - if (!mods_active) { - return; - } - - opal_output_verbose(5, orte_sensor_base_framework.framework_output, - "%s sensor:base: stopping sensors", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - if (orte_sensor_base.active) { - opal_event_del(&orte_sensor_base.sample_ev); - orte_sensor_base.active = false; - } - - /* call the stop function of all modules in priority order */ - for (i=0; i < orte_sensor_base.modules.size; i++) { - if (NULL == (i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i))) { - continue; - } - if (NULL != i_module->module->stop) { - i_module->module->stop(job); - } - } - - return; -} - -void orte_sensor_base_sample(int fd, short args, void *cbdata) -{ - orte_sensor_active_module_t *i_module; - int i; - - if (!mods_active) { - return; - } - - /* see if we were ordered to stop */ - if (!orte_sensor_base.active) { - return; - } - - opal_output_verbose(5, orte_sensor_base_framework.framework_output, - "%s sensor:base: sampling sensors", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* call the sample function of all modules in priority order from - * highest to lowest - the heartbeat should always be the lowest - * priority, so it will send any collected data - */ - for (i=0; i < orte_sensor_base.modules.size; i++) { - if (NULL == (i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i))) { - continue; - } - if (NULL != i_module->module->sample) { - 
opal_output_verbose(5, orte_sensor_base_framework.framework_output, - "%s sensor:base: sampling component %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - i_module->component->base_version.mca_component_name); - i_module->module->sample(); - } - } - - /* restart the timer */ - opal_event_evtimer_add(&orte_sensor_base.sample_ev, &orte_sensor_base.rate); - - return; -} - -void orte_sensor_base_log(char *comp, opal_buffer_t *data) -{ - int i; - orte_sensor_active_module_t *i_module; - - if (NULL == comp) { - /* nothing we can do */ - return; - } - - opal_output_verbose(5, orte_sensor_base_framework.framework_output, - "%s sensor:base: logging sensor %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), comp); - - /* find the specified module */ - for (i=0; i < orte_sensor_base.modules.size; i++) { - if (NULL == (i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i))) { - continue; - } - if (0 == strcmp(comp, i_module->component->base_version.mca_component_name)) { - if (NULL != i_module->module->log) { - i_module->module->log(data); - } - return; - } - } -} diff --git a/orte/mca/sensor/base/sensor_base_frame.c b/orte/mca/sensor/base/sensor_base_frame.c deleted file mode 100644 index 73c6cdf79cc..00000000000 --- a/orte/mca/sensor/base/sensor_base_frame.c +++ /dev/null @@ -1,133 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved. - * Copyright (c) 2017 Intel, Inc. All rights reserved. - * Copyright (c) 2017 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/mca.h" -#include "opal/util/argv.h" -#include "opal/util/output.h" -#include "opal/mca/base/base.h" -#include "opal/class/opal_pointer_array.h" - -#ifdef HAVE_STRING_H -#include -#endif - -#include "orte/mca/sensor/base/base.h" -#include "orte/mca/sensor/base/sensor_private.h" - -/* - * The following file was created by configure. It contains extern - * statements and the definition of an array of pointers to each - * component's public mca_base_component_t struct. - */ - -#include "orte/mca/sensor/base/static-components.h" - -/* - * Global variables - */ -orte_sensor_base_API_module_t orte_sensor = { - orte_sensor_base_start, - orte_sensor_base_stop -}; -orte_sensor_base_t orte_sensor_base = {{{0}}}; - -/* - * Local variables - */ -static int orte_sensor_base_sample_rate = 0; - -static int orte_sensor_base_register(mca_base_register_flag_t flags) -{ - int var_id; - - orte_sensor_base_sample_rate = 0; - var_id = mca_base_var_register("orte", "sensor", "base", "sample_rate", - "Sample rate in seconds", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &orte_sensor_base_sample_rate); - mca_base_var_register_synonym(var_id, "orte", "sensor", NULL, "sample_rate", - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - - /* see if we want samples logged */ - orte_sensor_base.log_samples = false; - var_id = mca_base_var_register("orte", "sensor", "base", "log_samples", - "Log samples to database", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &orte_sensor_base.log_samples); - mca_base_var_register_synonym(var_id, "orte", "sensor", NULL, "log_samples", - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - - return ORTE_SUCCESS; -} - -static int orte_sensor_base_close(void) -{ - orte_sensor_active_module_t *i_module; - int i; - - for (i=0; i < 
orte_sensor_base.modules.size; i++) { - if (NULL == (i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i))) { - continue; - } - if (NULL != i_module->module->finalize) { - i_module->module->finalize(); - } - } - OBJ_DESTRUCT(&orte_sensor_base.modules); - - /* Close all remaining available components */ - return mca_base_framework_components_close(&orte_sensor_base_framework, NULL); -} - -/** - * Function for finding and opening either all MCA components, or the one - * that was specifically requested via a MCA parameter. - */ -static int orte_sensor_base_open(mca_base_open_flag_t flags) -{ - /* initialize globals */ - orte_sensor_base.active = false; - - /* construct the array of modules */ - OBJ_CONSTRUCT(&orte_sensor_base.modules, opal_pointer_array_t); - opal_pointer_array_init(&orte_sensor_base.modules, 3, INT_MAX, 1); - - /* get the sample rate */ - orte_sensor_base.rate.tv_sec = orte_sensor_base_sample_rate; - orte_sensor_base.rate.tv_usec = 0; - - /* Open up all available components */ - return mca_base_framework_components_open(&orte_sensor_base_framework, flags); -} - -MCA_BASE_FRAMEWORK_DECLARE(orte, sensor, "ORTE Monitoring Sensors", - orte_sensor_base_register, - orte_sensor_base_open, orte_sensor_base_close, - mca_sensor_base_static_components, 0); - -static void cons(orte_sensor_active_module_t *t) -{ - t->sampling = true; -} -OBJ_CLASS_INSTANCE(orte_sensor_active_module_t, - opal_object_t, - cons, NULL); diff --git a/orte/mca/sensor/base/sensor_base_select.c b/orte/mca/sensor/base/sensor_base_select.c deleted file mode 100644 index 353414b7eef..00000000000 --- a/orte/mca/sensor/base/sensor_base_select.c +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved. - * - * Copyright (c) 2017 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include "orte_config.h" -#ifdef HAVE_STRING_H -#include -#endif - -#include "orte/constants.h" - -#include "opal/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/util/argv.h" -#include "opal/util/output.h" -#include "opal/class/opal_pointer_array.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/sensor/base/base.h" -#include "orte/mca/sensor/base/sensor_private.h" - - -static bool selected = false; - -/** - * Function for weeding out sensor components that don't want to run. - * - * Call the init function on all available components to find out if - * they want to run. Select all components that don't fail. Failing - * components will be closed and unloaded. The selected modules will - * be returned to the caller in a opal_list_t. - */ -int orte_sensor_base_select(void) -{ - mca_base_component_list_item_t *cli = NULL; - orte_sensor_base_component_t *component = NULL; - mca_base_module_t *module = NULL; - orte_sensor_active_module_t *i_module; - int priority = 0, i, j, low_i; - opal_pointer_array_t tmp_array; - bool none_found; - orte_sensor_active_module_t *tmp_module = NULL, *tmp_module_sw = NULL; - bool duplicate; - - if (selected) { - return ORTE_SUCCESS; - } - selected = true; - - OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t); - - opal_output_verbose(10, orte_sensor_base_framework.framework_output, - "sensor:base:select: Auto-selecting components"); - - /* - * Traverse the list of available components. - * For each call their 'query' functions to determine relative priority. - */ - none_found = true; - OPAL_LIST_FOREACH(cli, &orte_sensor_base_framework.framework_components, mca_base_component_list_item_t) { - component = (orte_sensor_base_component_t *) cli->cli_component; - - /* - * If there is a query function then use it. 
- */ - if (NULL == component->base_version.mca_query_component) { - opal_output_verbose(5, orte_sensor_base_framework.framework_output, - "sensor:base:select Skipping component [%s]. It does not implement a query function", - component->base_version.mca_component_name ); - continue; - } - - /* - * Query this component for the module and priority - */ - opal_output_verbose(5, orte_sensor_base_framework.framework_output, - "sensor:base:select Querying component [%s]", - component->base_version.mca_component_name); - - component->base_version.mca_query_component(&module, &priority); - - /* - * If no module was returned or negative priority, then skip component - */ - if (NULL == module || priority < 0) { - opal_output_verbose(5, orte_sensor_base_framework.framework_output, - "sensor:base:select Skipping component [%s]. Query failed to return a module", - component->base_version.mca_component_name ); - continue; - } - - /* check to see if we already have someone who senses the - * same things - if so, take the higher priority one - */ - duplicate = false; - for (i=0; i < tmp_array.size; i++) { - tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i); - if (NULL == tmp_module) { - continue; - } - if (0 == strcmp(component->data_measured, tmp_module->component->data_measured)) { - if (tmp_module->priority < priority) { - opal_output_verbose(5, orte_sensor_base_framework.framework_output, - "sensor:base:select Replacing component %s with %s - both measure %s", - tmp_module->component->base_version.mca_component_name, - component->base_version.mca_component_name, - component->data_measured); - OBJ_RELEASE(tmp_module); - opal_pointer_array_set_item(&tmp_array, i, NULL); - break; - } else { - duplicate = true; - } - } - } - if (duplicate) { - /* ignore this component */ - opal_output_verbose(5, orte_sensor_base_framework.framework_output, - "sensor:base:select Ignoring component %s - duplicate with higher priority measures %s", - 
component->base_version.mca_component_name, - component->data_measured); - continue; - } - - /* - * Append them to the temporary list, we will sort later - */ - opal_output_verbose(5, orte_sensor_base_framework.framework_output, - "sensor:base:select Query of component [%s] set priority to %d", - component->base_version.mca_component_name, priority); - tmp_module = OBJ_NEW(orte_sensor_active_module_t); - tmp_module->component = component; - tmp_module->module = (orte_sensor_base_module_t*)module; - tmp_module->priority = priority; - - opal_pointer_array_add(&tmp_array, (void*)tmp_module); - none_found = false; - } - - if (none_found) { - /* okay for no modules to be found */ - return ORTE_SUCCESS; - } - - /* - * Sort the list by decending priority - */ - priority = 0; - for(j = 0; j < tmp_array.size; ++j) { - tmp_module_sw = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, j); - if( NULL == tmp_module_sw ) { - continue; - } - - low_i = -1; - priority = tmp_module_sw->priority; - - for(i = 0; i < tmp_array.size; ++i) { - tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, i); - if( NULL == tmp_module ) { - continue; - } - if( tmp_module->priority > priority ) { - low_i = i; - priority = tmp_module->priority; - } - } - - if( low_i >= 0 ) { - tmp_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&tmp_array, low_i); - opal_pointer_array_set_item(&tmp_array, low_i, NULL); - j--; /* Try this entry again, if it is not the lowest */ - } else { - tmp_module = tmp_module_sw; - opal_pointer_array_set_item(&tmp_array, j, NULL); - } - opal_output_verbose(5, orte_sensor_base_framework.framework_output, - "sensor:base:select Add module with priority [%s] %d", - tmp_module->component->base_version.mca_component_name, tmp_module->priority); - opal_pointer_array_add(&orte_sensor_base.modules, tmp_module); - } - OBJ_DESTRUCT(&tmp_array); - - /* - * Initialize each of the modules in priority order from - * highest to 
lowest - */ - for(i = 0; i < orte_sensor_base.modules.size; ++i) { - i_module = (orte_sensor_active_module_t*)opal_pointer_array_get_item(&orte_sensor_base.modules, i); - if( NULL == i_module ) { - continue; - } - if( NULL != i_module->module->init ) { - if (ORTE_SUCCESS != i_module->module->init()) { - /* can't sample - however, if we are the HNP, - * then we need this module - * anyway so we can log incoming data - */ - if (ORTE_PROC_IS_HNP) { - i_module->sampling = false; - } else { - opal_pointer_array_set_item(&orte_sensor_base.modules, i, NULL); - OBJ_RELEASE(i_module); - } - } - } - } - - return ORTE_SUCCESS; -} diff --git a/orte/mca/sensor/base/sensor_private.h b/orte/mca/sensor/base/sensor_private.h deleted file mode 100644 index 3178b05bf5a..00000000000 --- a/orte/mca/sensor/base/sensor_private.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file: - */ - -#ifndef MCA_SENSOR_PRIVATE_H -#define MCA_SENSOR_PRIVATE_H - -/* - * includes - */ -#include "orte_config.h" - -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ - -#include "opal/class/opal_pointer_array.h" -#include "opal/mca/event/event.h" - -#include "orte/runtime/orte_globals.h" - -#include "orte/mca/sensor/sensor.h" - - -/* - * Global functions for MCA overall collective open and close - */ -BEGIN_C_DECLS - -/* define a struct to hold framework-global values */ -typedef struct { - opal_pointer_array_t modules; - bool log_samples; - bool active; - struct timeval rate; - opal_event_t sample_ev; - opal_buffer_t *samples; -} orte_sensor_base_t; - -typedef struct { - opal_object_t super; - orte_sensor_base_component_t *component; - orte_sensor_base_module_t *module; - int priority; - bool sampling; -} orte_sensor_active_module_t; -OBJ_CLASS_DECLARATION(orte_sensor_active_module_t); - - -ORTE_DECLSPEC extern orte_sensor_base_t orte_sensor_base; -ORTE_DECLSPEC void orte_sensor_base_start(orte_jobid_t job); -ORTE_DECLSPEC void orte_sensor_base_stop(orte_jobid_t job); -ORTE_DECLSPEC void orte_sensor_base_sample(int fd, short args, void *cbdata); -ORTE_DECLSPEC void orte_sensor_base_log(char *comp, opal_buffer_t *data); - -END_C_DECLS -#endif diff --git a/orte/mca/sensor/file/configure.m4 b/orte/mca/sensor/file/configure.m4 deleted file mode 100644 index 67f19d12e59..00000000000 --- a/orte/mca/sensor/file/configure.m4 +++ /dev/null @@ -1,24 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2011-2013 Los Alamos National Security, LLC. -# All rights reserved. -# Copyright (c) 2017 Intel, Inc. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_sensor_file_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_orte_sensor_file_CONFIG], [ - AC_CONFIG_FILES([orte/mca/sensor/file/Makefile]) - - # if we don't want sensors, don't compile - # this component - AS_IF([test "$orte_want_sensors" = "1"], - [$1], [$2]) -])dnl - diff --git a/orte/mca/sensor/file/sensor_file.c b/orte/mca/sensor/file/sensor_file.c deleted file mode 100644 index 958a6a97a00..00000000000 --- a/orte/mca/sensor/file/sensor_file.c +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. - * All rights reserved. - * - * Copyright (c) 2017 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#include -#include -#ifdef HAVE_UNISTD_H -#include -#endif -#ifdef HAVE_NETDB_H -#include -#endif -#ifdef HAVE_SYS_PARAM_H -#include -#endif -#include -#include -#include -#ifdef HAVE_TIME_H -#include -#endif -#include -#include - -#include "opal_stdint.h" -#include "opal/util/output.h" - -#include "orte/util/show_help.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/state/state.h" -#include "orte/util/name_fns.h" -#include "orte/runtime/orte_globals.h" - -#include "orte/mca/sensor/base/base.h" -#include "orte/mca/sensor/base/sensor_private.h" -#include "sensor_file.h" - -/* declare the API functions */ -static int init(void); -static void finalize(void); -static void start(orte_jobid_t job); -static void stop(orte_jobid_t job); -static void file_sample(void); -static void file_log(opal_buffer_t *sample); 
- -/* instantiate the module */ -orte_sensor_base_module_t orte_sensor_file_module = { - init, - finalize, - start, - stop, - file_sample, - file_log -}; - -/* define a tracking object */ -typedef struct { - opal_list_item_t super; - orte_jobid_t jobid; - orte_vpid_t vpid; - char *file; - int tick; - bool check_size; - bool check_access; - bool check_mod; - int32_t file_size; - time_t last_access; - time_t last_mod; - int limit; -} file_tracker_t; -static void ft_constructor(file_tracker_t *ft) -{ - ft->file = NULL; - ft->tick = 0; - ft->file_size = 0; - ft->last_access = 0; - ft->last_mod = 0; - ft->limit = 0; -} -static void ft_destructor(file_tracker_t *ft) -{ - if (NULL != ft->file) { - free(ft->file); - } -} -OBJ_CLASS_INSTANCE(file_tracker_t, - opal_list_item_t, - ft_constructor, ft_destructor); - -/* local globals */ -static opal_list_t jobs; - -static int init(void) -{ - OBJ_CONSTRUCT(&jobs, opal_list_t); - return ORTE_SUCCESS; -} - -static void finalize(void) -{ - opal_list_item_t *item; - - while (NULL != (item = opal_list_remove_first(&jobs))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&jobs); - - return; -} - -static bool find_value(orte_app_context_t *app, - char *pattern, char **value) -{ - int i; - char *ptr; - - for (i=0; NULL != app->env[i]; i++) { - if (0 == strncmp(app->env[i], pattern, strlen(pattern))) { - ptr = strchr(app->env[i], '='); - ptr++; - if (NULL != value) { - *value = strdup(ptr); - } - return true; - } - } - return false; -} - -/* - * Start monitoring of local processes - */ -static void start(orte_jobid_t jobid) -{ - orte_job_t *jobdat; - orte_app_context_t *app, *aptr; - int i; - char *filename; - file_tracker_t *ft; - char *ptr; - - /* cannot monitor my own job */ - if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) { - return; - } - - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s starting file monitoring for job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - 
ORTE_JOBID_PRINT(jobid))); - - /* get the local jobdat for this job */ - if (NULL == (jobdat = orte_get_job_data_object(jobid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return; - } - - /* must be at least one app_context, so use the first one found */ - app = NULL; - for (i=0; i < jobdat->apps->size; i++) { - if (NULL != (aptr = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, i))) { - app = aptr; - break; - } - } - if (NULL == app) { - /* got a problem */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return; - } - - /* search the environ to get the filename */ - if (!find_value(app, "OMPI_MCA_sensor_file_filename", &filename)) { - /* was a default file given */ - if (NULL == mca_sensor_file_component.file) { - /* can't do anything without a file */ - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s sensor:file no file for job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobid))); - return; - } - filename = mca_sensor_file_component.file; - } - - /* create the tracking object */ - ft = OBJ_NEW(file_tracker_t); - ft->jobid = jobid; - ft->file = strdup(filename); - - /* search the environ to see what we are checking */ - if (!find_value(app, "OMPI_MCA_sensor_file_check_size", &ptr)) { - /* was a default value given */ - if (0 < mca_sensor_file_component.check_size) { - ft->check_size = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_size); - } - } else { - ft->check_size = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10)); - free(ptr); - } - - if (!find_value(app, "OMPI_MCA_sensor_file_check_access", &ptr)) { - /* was a default value given */ - if (0 < mca_sensor_file_component.check_access) { - ft->check_access = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_access); - } - } else { - ft->check_access = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10)); - free(ptr); - } - - if (!find_value(app, "OMPI_MCA_sensor_file_check_mod", &ptr)) { - /* was a default value given */ - if (0 < mca_sensor_file_component.check_mod) { - 
ft->check_mod = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_mod); - } - } else { - ft->check_mod = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10)); - free(ptr); - } - - if (!find_value(app, "OMPI_MCA_sensor_file_limit", &ptr)) { - ft->limit = mca_sensor_file_component.limit; - } else { - ft->limit = strtol(ptr, NULL, 10); - free(ptr); - } - opal_list_append(&jobs, &ft->super); - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s file %s monitored for %s%s%s with limit %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ft->file, ft->check_size ? "SIZE:" : " ", - ft->check_access ? "ACCESS TIME:" : " ", - ft->check_mod ? "MOD TIME" : " ", ft->limit)); - return; -} - - -static void stop(orte_jobid_t jobid) -{ - opal_list_item_t *item; - file_tracker_t *ft; - - /* cannot monitor my own job */ - if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) { - return; - } - - for (item = opal_list_get_first(&jobs); - item != opal_list_get_end(&jobs); - item = opal_list_get_next(item)) { - ft = (file_tracker_t*)item; - if (jobid == ft->jobid || ORTE_JOBID_WILDCARD == jobid) { - opal_list_remove_item(&jobs, item); - OBJ_RELEASE(item); - } - } - return; -} - -static void file_sample(void) -{ - struct stat buf; - opal_list_item_t *item; - file_tracker_t *ft; - orte_job_t *jdata; - - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s sampling files", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - for (item = opal_list_get_first(&jobs); - item != opal_list_get_end(&jobs); - item = opal_list_get_next(item)) { - ft = (file_tracker_t*)item; - - /* stat the file and get its size */ - if (0 > stat(ft->file, &buf)) { - /* cannot stat file */ - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s could not stat %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ft->file)); - continue; - } - - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s size %lu access %s\tmod %s", - 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (unsigned long)buf.st_size, ctime(&buf.st_atime), ctime(&buf.st_mtime))); - - if (ft->check_size) { - if (buf.st_size == ft->file_size) { - ft->tick++; - goto CHECK; - } else { - ft->tick = 0; - ft->file_size = buf.st_size; - } - } - if (ft->check_access) { - if (buf.st_atime == ft->last_access) { - ft->tick++; - goto CHECK; - } else { - ft->tick = 0; - ft->last_access = buf.st_atime; - } - } - if (ft->check_mod) { - if (buf.st_mtime == ft->last_mod) { - ft->tick++; - goto CHECK; - } else { - ft->tick = 0; - ft->last_mod = buf.st_mtime; - } - } - - CHECK: - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s sampled file %s tick %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ft->file, ft->tick)); - - if (ft->tick == ft->limit) { - orte_show_help("help-orte-sensor-file.txt", "file-stalled", true, - ft->file, ft->file_size, ctime(&ft->last_access), ctime(&ft->last_mod)); - jdata = orte_get_job_data_object(ft->jobid); - ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED); - } - } -} - -static void file_log(opal_buffer_t *sample) -{ -} diff --git a/orte/mca/sensor/file/sensor_file.h b/orte/mca/sensor/file/sensor_file.h deleted file mode 100644 index d923ee6aa61..00000000000 --- a/orte/mca/sensor/file/sensor_file.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * - * Copyright (c) 2017 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * - * File movement sensor - */ -#ifndef ORTE_SENSOR_FILE_H -#define ORTE_SENSOR_FILE_H - -#include "orte_config.h" - -#include "orte/mca/sensor/sensor.h" - -BEGIN_C_DECLS - -struct orte_sensor_file_component_t { - orte_sensor_base_component_t super; - int sample_rate; - char *file; - bool check_size; - bool check_access; - bool check_mod; - int limit; -}; -typedef struct orte_sensor_file_component_t orte_sensor_file_component_t; - -ORTE_MODULE_DECLSPEC extern orte_sensor_file_component_t mca_sensor_file_component; -extern orte_sensor_base_module_t orte_sensor_file_module; - - -END_C_DECLS - -#endif diff --git a/orte/mca/sensor/file/sensor_file_component.c b/orte/mca/sensor/file/sensor_file_component.c deleted file mode 100644 index e3b930a59e1..00000000000 --- a/orte/mca/sensor/file/sensor_file_component.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2017 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/base/base.h" -#include "opal/util/output.h" -#include "opal/class/opal_pointer_array.h" - -#include "orte/util/proc_info.h" -#include "orte/util/show_help.h" - -#include "sensor_file.h" - -/* - * Local functions - */ -static int orte_sensor_file_register (void); -static int orte_sensor_file_open(void); -static int orte_sensor_file_close(void); -static int orte_sensor_file_query(mca_base_module_t **module, int *priority); - -orte_sensor_file_component_t mca_sensor_file_component = { - { - { - ORTE_SENSOR_BASE_VERSION_1_0_0, - - "file", /* MCA component name */ - ORTE_MAJOR_VERSION, /* MCA component major version */ - ORTE_MINOR_VERSION, /* MCA component minor version */ - ORTE_RELEASE_VERSION, /* MCA component release version */ - orte_sensor_file_open, /* component open */ - orte_sensor_file_close, /* component close */ - orte_sensor_file_query, /* component query */ - orte_sensor_file_register - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - "filemods" // data being sensed - } -}; - - -/** - * component register/open/close/init function - */ -static int orte_sensor_file_register (void) -{ - mca_base_component_t *c = &mca_sensor_file_component.super.base_version; - - /* lookup parameters */ - mca_sensor_file_component.file = NULL; - (void) mca_base_component_var_register (c, "filename", "File to be monitored", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_ALL_EQ, - &mca_sensor_file_component.file); - - mca_sensor_file_component.check_size = false; - (void) mca_base_component_var_register (c, "check_size", "Check the file size", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_ALL_EQ, - &mca_sensor_file_component.check_size); - - mca_sensor_file_component.check_access = false; - (void) 
mca_base_component_var_register (c, "check_access", "Check access time", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_ALL_EQ, - &mca_sensor_file_component.check_access); - - mca_sensor_file_component.check_mod = false; - (void) mca_base_component_var_register (c, "check_mod", "Check modification time", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_ALL_EQ, - &mca_sensor_file_component.check_mod); - - mca_sensor_file_component.limit = 3; - (void) mca_base_component_var_register (c, "limit", - "Number of times the sensor can detect no motion before declaring error (default=3)", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_ALL_EQ, - &mca_sensor_file_component.limit); - return ORTE_SUCCESS; -} - -static int orte_sensor_file_open(void) -{ - return ORTE_SUCCESS; -} - - -static int orte_sensor_file_query(mca_base_module_t **module, int *priority) -{ - *priority = 20; /* higher than heartbeat */ - *module = (mca_base_module_t *)&orte_sensor_file_module; - return ORTE_SUCCESS; -} - -/** - * Close all subsystems. - */ - -static int orte_sensor_file_close(void) -{ - return ORTE_SUCCESS; -} diff --git a/orte/mca/sensor/ft_tester/Makefile.am b/orte/mca/sensor/ft_tester/Makefile.am deleted file mode 100644 index 83cf1277701..00000000000 --- a/orte/mca/sensor/ft_tester/Makefile.am +++ /dev/null @@ -1,36 +0,0 @@ -# -# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. -# -# Copyright (c) 2017 Intel, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -sources = \ - sensor_ft_tester.c \ - sensor_ft_tester.h \ - sensor_ft_tester_component.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). 
- -if MCA_BUILD_orte_sensor_ft_tester_DSO -component_noinst = -component_install = mca_sensor_ft_tester.la -else -component_noinst = libmca_sensor_ft_tester.la -component_install = -endif - -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_sensor_ft_tester_la_SOURCES = $(sources) -mca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_sensor_ft_tester_la_SOURCES =$(sources) -libmca_sensor_ft_tester_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/sensor/ft_tester/configure.m4 b/orte/mca/sensor/ft_tester/configure.m4 deleted file mode 100644 index a88d34280c4..00000000000 --- a/orte/mca/sensor/ft_tester/configure.m4 +++ /dev/null @@ -1,24 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2011-2013 Los Alamos National Security, LLC. -# All rights reserved. -# Copyright (c) 2017 Intel, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_sensor_ft_tester_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_orte_sensor_ft_tester_CONFIG], [ - AC_CONFIG_FILES([orte/mca/sensor/ft_tester/Makefile]) - - # if we don't want sensors, don't compile - # this component - AS_IF([test "$orte_want_sensors" = "1"], - [$1], [$2]) -])dnl - diff --git a/orte/mca/sensor/ft_tester/sensor_ft_tester.h b/orte/mca/sensor/ft_tester/sensor_ft_tester.h deleted file mode 100644 index 241f04d51fc..00000000000 --- a/orte/mca/sensor/ft_tester/sensor_ft_tester.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. - * - * Copyright (c) 2017 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * - * Process Resource Utilization sensor - */ -#ifndef ORTE_SENSOR_FT_TESTER_H -#define ORTE_SENSOR_FT_TESTER_H - -#include "orte_config.h" - -#include "orte/mca/sensor/sensor.h" -#include "opal/util/alfg.h" - -BEGIN_C_DECLS - -struct orte_sensor_ft_tester_component_t { - orte_sensor_base_component_t super; - float fail_prob; - float daemon_fail_prob; - bool multi_fail; -}; -typedef struct orte_sensor_ft_tester_component_t orte_sensor_ft_tester_component_t; - -ORTE_MODULE_DECLSPEC extern orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component; -extern orte_sensor_base_module_t orte_sensor_ft_tester_module; - -extern opal_rng_buff_t orte_sensor_ft_rng_buff; - -END_C_DECLS - -#endif diff --git a/orte/mca/sensor/ft_tester/sensor_ft_tester_component.c b/orte/mca/sensor/ft_tester/sensor_ft_tester_component.c deleted file mode 100644 index 5f57bdf9056..00000000000 --- a/orte/mca/sensor/ft_tester/sensor_ft_tester_component.c +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. - * Copyright (c) 2017 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/base/base.h" -#include "opal/util/output.h" -#include "opal/class/opal_pointer_array.h" - -#include "orte/util/proc_info.h" -#include "orte/util/show_help.h" - -#include "sensor_ft_tester.h" - -/* - * Local functions - */ -static int orte_sensor_ft_tester_register (void); -static int orte_sensor_ft_tester_open(void); -static int orte_sensor_ft_tester_close(void); -static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority); - -orte_sensor_ft_tester_component_t mca_sensor_ft_tester_component = { - { - { - ORTE_SENSOR_BASE_VERSION_1_0_0, - - "ft_tester", /* MCA component name */ - ORTE_MAJOR_VERSION, /* MCA component major version */ - ORTE_MINOR_VERSION, /* MCA component minor version */ - ORTE_RELEASE_VERSION, /* MCA component release version */ - orte_sensor_ft_tester_open, /* component open */ - orte_sensor_ft_tester_close, /* component close */ - orte_sensor_ft_tester_query, /* component query */ - orte_sensor_ft_tester_register - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - NULL - } -}; - -static char *daemon_fail_prob = NULL; -static char *fail_prob = NULL; -opal_rng_buff_t orte_sensor_ft_rng_buff; - -/** - * component register/open/close/init function - */ -static int orte_sensor_ft_tester_register (void) -{ - mca_base_component_t *c = &mca_sensor_ft_tester_component.super.base_version; - - fail_prob = NULL; - (void) mca_base_component_var_register (c, "fail_prob", "Probability of killing a single executable", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &fail_prob); - - mca_sensor_ft_tester_component.multi_fail = false; - (void) mca_base_component_var_register (c, "multi_allowed", "Allow multiple executables to be killed at one time", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - 
OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_sensor_ft_tester_component.multi_fail); - - daemon_fail_prob = NULL; - (void) mca_base_component_var_register (c, "daemon_fail_prob", "Probability of killing a daemon", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &daemon_fail_prob); - - return ORTE_SUCCESS; -} - -static int orte_sensor_ft_tester_open(void) -{ - /* lookup parameters */ - if (NULL != fail_prob) { - mca_sensor_ft_tester_component.fail_prob = strtof(fail_prob, NULL); - if (1.0 < mca_sensor_ft_tester_component.fail_prob) { - /* given in percent */ - mca_sensor_ft_tester_component.fail_prob /= 100.0; - } - } else { - mca_sensor_ft_tester_component.fail_prob = 0.0; - } - - if (NULL != daemon_fail_prob) { - mca_sensor_ft_tester_component.daemon_fail_prob = strtof(daemon_fail_prob, NULL); - if (1.0 < mca_sensor_ft_tester_component.daemon_fail_prob) { - /* given in percent */ - mca_sensor_ft_tester_component.daemon_fail_prob /= 100.0; - } - } else { - mca_sensor_ft_tester_component.daemon_fail_prob = 0.0; - } - - return ORTE_SUCCESS; -} - - -static int orte_sensor_ft_tester_query(mca_base_module_t **module, int *priority) -{ - if (0.0 < mca_sensor_ft_tester_component.fail_prob || - 0.0 < mca_sensor_ft_tester_component.daemon_fail_prob) { - *priority = 1; /* at the bottom */ - *module = (mca_base_module_t *)&orte_sensor_ft_tester_module; - /* seed the RNG --- Not sure if we should assume all procs use - * the same seed? - */ - opal_srand(&orte_sensor_ft_rng_buff, (uint32_t) getpid()); - return ORTE_SUCCESS; - } - *priority = 0; - *module = NULL; - return ORTE_ERROR; - -} - -/** - * Close all subsystems. 
- */ - -static int orte_sensor_ft_tester_close(void) -{ - return ORTE_SUCCESS; -} diff --git a/orte/mca/sensor/heartbeat/Makefile.am b/orte/mca/sensor/heartbeat/Makefile.am deleted file mode 100644 index c6246e666dd..00000000000 --- a/orte/mca/sensor/heartbeat/Makefile.am +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# -# Copyright (c) 2017 Intel, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -dist_ompidata_DATA = help-orte-sensor-heartbeat.txt - -sources = \ - sensor_heartbeat.c \ - sensor_heartbeat.h \ - sensor_heartbeat_component.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_sensor_heartbeat_DSO -component_noinst = -component_install = mca_sensor_heartbeat.la -else -component_noinst = libmca_sensor_heartbeat.la -component_install = -endif - -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_sensor_heartbeat_la_SOURCES = $(sources) -mca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_sensor_heartbeat_la_SOURCES =$(sources) -libmca_sensor_heartbeat_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/sensor/heartbeat/configure.m4 b/orte/mca/sensor/heartbeat/configure.m4 deleted file mode 100644 index ce8daf427a7..00000000000 --- a/orte/mca/sensor/heartbeat/configure.m4 +++ /dev/null @@ -1,24 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2011-2013 Los Alamos National Security, LLC. -# All rights reserved. -# Copyright (c) 2017 Intel, Inc. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_sensor_heartbeat_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_orte_sensor_heartbeat_CONFIG], [ - AC_CONFIG_FILES([orte/mca/sensor/heartbeat/Makefile]) - - # if we don't want sensors, don't compile - # this component - AS_IF([test "$orte_want_sensors" = "1"], - [$1], [$2]) -])dnl - diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat.c b/orte/mca/sensor/heartbeat/sensor_heartbeat.c deleted file mode 100644 index f5ceb60d5c6..00000000000 --- a/orte/mca/sensor/heartbeat/sensor_heartbeat.c +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights - * reserved. - * - * Copyright (c) 2017 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif /* HAVE_STRING_H */ -#include - -#include "opal_stdint.h" -#include "opal/util/argv.h" -#include "opal/util/output.h" -#include "opal/mca/event/event.h" - -#include "orte/util/show_help.h" -#include "orte/util/proc_info.h" -#include "orte/util/name_fns.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/state/state.h" -#include "orte/runtime/orte_wait.h" -#include "orte/runtime/orte_globals.h" - -#include "orte/mca/sensor/base/base.h" -#include "orte/mca/sensor/base/sensor_private.h" -#include "sensor_heartbeat.h" - -/* declare the API functions */ -static int init(void); -static void finalize(void); -static void start(orte_jobid_t job); -static void sample(void); - -/* instantiate the module */ -orte_sensor_base_module_t orte_sensor_heartbeat_module = { - init, - 
finalize, - start, - NULL, - sample, - NULL -}; - -/* declare the local functions */ -static void check_heartbeat(int fd, short event, void *arg); -static void recv_beats(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata); - -/* local globals */ -static orte_job_t *daemons=NULL; -static opal_event_t check_ev; -static bool check_active = false; -static struct timeval check_time; - -static int init(void) -{ - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s initializing heartbeat recvs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* setup to receive heartbeats */ - if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_AGGREGATOR) { - orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, - ORTE_RML_TAG_HEARTBEAT, - ORTE_RML_PERSISTENT, - recv_beats, NULL); - } - - if (ORTE_PROC_IS_HNP) { - daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - } - - return ORTE_SUCCESS; -} - -static void finalize(void) -{ - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_HEARTBEAT); - if (check_active) { - opal_event_del(&check_ev); - check_active = false; - } - return; -} - -static void start(orte_jobid_t job) -{ - if (!check_active && NULL != daemons) { - /* setup the check event */ - check_time.tv_sec = 3 * orte_sensor_base.rate.tv_sec; - check_time.tv_usec = 0; - opal_event_evtimer_set(orte_event_base, &check_ev, check_heartbeat, &check_ev); - opal_event_evtimer_add(&check_ev, &check_time); - check_active = true; - } -} - -static void sample(void) -{ - opal_buffer_t *buf; - int rc; - orte_process_name_t *tgt; - - /* if we are aborting or shutting down, ignore this */ - if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) { - return; - } - - if (ORTE_PROC_IS_CM) { - /* we send to our daemon */ - tgt = ORTE_PROC_MY_DAEMON; - } else { - tgt = ORTE_PROC_MY_HNP; - } - /* if my target hasn't been defined yet, ignore - nobody listening yet */ - if (ORTE_JOBID_INVALID ==tgt->jobid || - ORTE_VPID_INVALID 
== tgt->vpid) { - opal_output_verbose(1, orte_sensor_base_framework.framework_output, - "%s sensor:heartbeat: HNP is not defined", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } - - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s sending heartbeat", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* if we want sampled data included, point to the bucket */ - buf = OBJ_NEW(opal_buffer_t); - if (orte_sensor_base.log_samples) { - opal_dss.copy_payload(buf, orte_sensor_base.samples); - OBJ_RELEASE(orte_sensor_base.samples); - /* start a new sample bucket */ - orte_sensor_base.samples = OBJ_NEW(opal_buffer_t); - } - - /* send heartbeat */ - if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(tgt, buf, - ORTE_RML_TAG_HEARTBEAT, - orte_rml_send_callback, NULL))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buf); - } -} - -/* this function automatically gets periodically called - * by the event library so we can check on the state - * of the various orteds - */ -static void check_heartbeat(int fd, short dummy, void *arg) -{ - int v; - orte_proc_t *proc; - opal_event_t *tmp = (opal_event_t*)arg; - - OPAL_OUTPUT_VERBOSE((3, orte_sensor_base_framework.framework_output, - "%s sensor:check_heartbeat", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* if we are aborting or shutting down, ignore this */ - if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) { - OPAL_OUTPUT_VERBOSE((3, orte_sensor_base_framework.framework_output, - "%s IGNORING CHECK abnorm_term %s fin %s init %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - orte_abnormal_term_ordered ? "TRUE" : "FALSE", - orte_finalizing ? "TRUE" : "FALSE", - orte_initialized ? 
"TRUE" : "FALSE")); - check_active = false; - return; - } - - for (v=0; v < daemons->procs->size; v++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) { - continue; - } - /* ignore myself */ - if (proc->name.vpid == ORTE_PROC_MY_NAME->vpid) { - continue; - } - if (ORTE_PROC_STATE_RUNNING != proc->state) { - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s sensor:heartbeat DAEMON %s IS NOT RUNNING", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - continue; - } - - if (0 == proc->beat) { - /* no heartbeat recvd in last window */ - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s sensor:check_heartbeat FAILED for daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - ORTE_ACTIVATE_PROC_STATE(&proc->name, ORTE_PROC_STATE_HEARTBEAT_FAILED); - } else { - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s HEARTBEAT DETECTED FOR %s: NUM BEATS %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name), proc->beat)); - } - /* reset for next period */ - proc->beat = 0; - } - - /* reset the timer */ - opal_event_evtimer_add(tmp, &check_time); -} - -static void recv_beats(int status, orte_process_name_t* sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, void *cbdata) -{ - orte_proc_t *proc; - int rc, n; - char *component=NULL; - opal_buffer_t *buf; - - opal_output_verbose(1, orte_sensor_base_framework.framework_output, - "%s received beat from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender)); - - /* if we are aborting or shutting down, ignore this */ - if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) { - return; - } - - /* get this daemon's object */ - if (NULL != daemons) { - if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, sender->vpid))) { - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "%s 
marked beat from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender))); - proc->beat++; - /* if this daemon has reappeared, reset things */ - if (ORTE_PROC_STATE_HEARTBEAT_FAILED == proc->state) { - proc->state = ORTE_PROC_STATE_RUNNING; - } - } - } - - /* unload any sampled data */ - n=1; - while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &buf, &n, OPAL_BUFFER))) { - if (NULL != buf) { - n=1; - if (OPAL_SUCCESS != (rc = opal_dss.unpack(buf, &component, &n, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - break; - } - orte_sensor_base_log(component, buf); - OBJ_RELEASE(buf); - free(component); - n=1; - } - } - if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { - ORTE_ERROR_LOG(rc); - } -} diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat.h b/orte/mca/sensor/heartbeat/sensor_heartbeat.h deleted file mode 100644 index 08aad98f2d8..00000000000 --- a/orte/mca/sensor/heartbeat/sensor_heartbeat.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. - * - * Copyright (c) 2017 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * - * Heartbeat sensor - */ -#ifndef ORTE_SENSOR_HEARTBEAT_H -#define ORTE_SENSOR_HEARTBEAT_H - -#include "orte_config.h" - -#include "orte/mca/sensor/sensor.h" - -BEGIN_C_DECLS - -ORTE_MODULE_DECLSPEC extern orte_sensor_base_component_t mca_sensor_heartbeat_component; -extern orte_sensor_base_module_t orte_sensor_heartbeat_module; - - -END_C_DECLS - -#endif diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat_component.c b/orte/mca/sensor/heartbeat/sensor_heartbeat_component.c deleted file mode 100644 index c2b38c67395..00000000000 --- a/orte/mca/sensor/heartbeat/sensor_heartbeat_component.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. - * Copyright (c) 2017 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/base/base.h" -#include "opal/util/output.h" -#include "opal/class/opal_pointer_array.h" - -#include "orte/util/proc_info.h" -#include "orte/util/show_help.h" - -#include "sensor_heartbeat.h" - -/* - * Local functions - */ - -static int orte_sensor_heartbeat_open(void); -static int orte_sensor_heartbeat_close(void); -static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority); - -orte_sensor_base_component_t mca_sensor_heartbeat_component = { - { - ORTE_SENSOR_BASE_VERSION_1_0_0, - - "heartbeat", /* MCA component name */ - ORTE_MAJOR_VERSION, /* MCA component major version */ - ORTE_MINOR_VERSION, /* MCA component minor version */ - ORTE_RELEASE_VERSION, /* MCA component release version */ - orte_sensor_heartbeat_open, /* component open */ - orte_sensor_heartbeat_close, /* component close */ - orte_sensor_heartbeat_query /* component query */ - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - "heartbeat" -}; - - -/** - * component open/close/init function - */ -static int orte_sensor_heartbeat_open(void) -{ - return ORTE_SUCCESS; -} - - -static int orte_sensor_heartbeat_query(mca_base_module_t **module, int *priority) -{ - *priority = 5; /* lower than all other samplers so that their data gets included in heartbeat */ - *module = (mca_base_module_t *)&orte_sensor_heartbeat_module; - return ORTE_SUCCESS; -} - -/** - * Close all subsystems. 
- */ - -static int orte_sensor_heartbeat_close(void) -{ - return ORTE_SUCCESS; -} diff --git a/orte/mca/sensor/resusage/Makefile.am b/orte/mca/sensor/resusage/Makefile.am deleted file mode 100644 index 8641c157578..00000000000 --- a/orte/mca/sensor/resusage/Makefile.am +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. -# -# Copyright (c) 2017 Intel, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -dist_ompidata_DATA = help-orte-sensor-resusage.txt - -sources = \ - sensor_resusage.c \ - sensor_resusage.h \ - sensor_resusage_component.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_sensor_resusage_DSO -component_noinst = -component_install = mca_sensor_resusage.la -else -component_noinst = libmca_sensor_resusage.la -component_install = -endif - -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_sensor_resusage_la_SOURCES = $(sources) -mca_sensor_resusage_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_sensor_resusage_la_SOURCES =$(sources) -libmca_sensor_resusage_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/sensor/resusage/configure.m4 b/orte/mca/sensor/resusage/configure.m4 deleted file mode 100644 index d53c50b0121..00000000000 --- a/orte/mca/sensor/resusage/configure.m4 +++ /dev/null @@ -1,24 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2011-2013 Los Alamos National Security, LLC. -# All rights reserved. -# Copyright (c) 2017 Intel, Inc. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_sensor_resusage_CONFIG([action-if-found], [action-if-not-found]) -# ----------------------------------------------------------- -AC_DEFUN([MCA_orte_sensor_resusage_CONFIG], [ - AC_CONFIG_FILES([orte/mca/sensor/resusage/Makefile]) - - # if we don't want sensors, don't compile - # this component - AS_IF([test "$orte_want_sensors" = "1"], - [$1], [$2]) -])dnl - diff --git a/orte/mca/sensor/resusage/help-orte-sensor-resusage.txt b/orte/mca/sensor/resusage/help-orte-sensor-resusage.txt deleted file mode 100644 index 2fa38bf331f..00000000000 --- a/orte/mca/sensor/resusage/help-orte-sensor-resusage.txt +++ /dev/null @@ -1,21 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. -# -# Copyright (c) 2017 Intel, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English general help file for the memory usage sensor -# -[mem-limit-exceeded] -A process has exceeded the specified limit on memory usage: - -Node: %s -Process rank: %s -Memory used: %luGbytes -Memory limit: %luGbytes - diff --git a/orte/mca/sensor/resusage/sensor_resusage.c b/orte/mca/sensor/resusage/sensor_resusage.c deleted file mode 100644 index 49d78187c79..00000000000 --- a/orte/mca/sensor/resusage/sensor_resusage.c +++ /dev/null @@ -1,478 +0,0 @@ -/* - * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights - * reserved. - * - * Copyright (c) 2017 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif /* HAVE_STRING_H */ -#include - -#include "opal_stdint.h" -#include "opal/class/opal_pointer_array.h" -#include "opal/class/opal_ring_buffer.h" -#include "opal/dss/dss.h" -#include "opal/util/output.h" -#include "opal/mca/pstat/pstat.h" -#include "opal/mca/db/db.h" - -#include "orte/util/proc_info.h" -#include "orte/util/name_fns.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/odls/odls_types.h" -#include "orte/mca/odls/base/odls_private.h" -#include "orte/mca/rml/rml.h" -#include "orte/mca/state/state.h" -#include "orte/runtime/orte_globals.h" -#include "orte/orted/orted.h" - -#include "orte/mca/sensor/base/base.h" -#include "orte/mca/sensor/base/sensor_private.h" -#include "sensor_resusage.h" - -/* declare the API functions */ -static int init(void); -static void finalize(void); -static void sample(void); -static void res_log(opal_buffer_t *sample); - -/* instantiate the module */ -orte_sensor_base_module_t orte_sensor_resusage_module = { - init, - finalize, - NULL, - NULL, - sample, - res_log -}; - -static bool log_enabled = true; -static orte_node_t *my_node; -static orte_proc_t *my_proc; - -static int init(void) -{ - orte_job_t *jdata; - - /* ensure my_proc and my_node are available on the global arrays */ - if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { - my_proc = OBJ_NEW(orte_proc_t); - my_node = OBJ_NEW(orte_node_t); - } else { - if (NULL == (my_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, ORTE_PROC_MY_NAME->vpid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - if (NULL == (my_node = my_proc->node)) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - /* protect the objects */ - 
OBJ_RETAIN(my_proc); - OBJ_RETAIN(my_node); - } - - return ORTE_SUCCESS; -} - -static void finalize(void) -{ - if (NULL != my_proc) { - OBJ_RELEASE(my_proc); - } - if (NULL != my_node) { - OBJ_RELEASE(my_node); - } - return; -} - -static void sample(void) -{ - opal_pstats_t *stats, *st; - opal_node_stats_t *nstats, *nst; - int rc, i; - orte_proc_t *child, *hog=NULL; - float in_use, max_mem; - opal_buffer_t buf, *bptr; - char *comp; - - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, - "sample:resusage sampling resource usage")); - - /* setup a buffer for our stats */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - /* pack our name */ - comp = strdup("resusage"); - if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &comp, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return; - } - free(comp); - - /* update stats on ourself and the node */ - stats = OBJ_NEW(opal_pstats_t); - nstats = OBJ_NEW(opal_node_stats_t); - if (ORTE_SUCCESS != (rc = opal_pstat.query(orte_process_info.pid, stats, nstats))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(stats); - OBJ_RELEASE(nstats); - OBJ_DESTRUCT(&buf); - return; - } - - /* the stats framework can't know nodename or rank */ - strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN); - stats->rank = ORTE_PROC_MY_NAME->vpid; - /* locally save the stats */ - if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&my_proc->stats, stats))) { - OBJ_RELEASE(st); - } - if (NULL != (nst = (opal_node_stats_t*)opal_ring_buffer_push(&my_node->stats, nstats))) { - /* release the popped value */ - OBJ_RELEASE(nst); - } - - /* pack them */ - if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return; - } - if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &nstats, 1, OPAL_NODE_STAT))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return; - } - if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) { - 
ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return; - } - - /* loop through our children and update their stats */ - if (NULL != orte_local_children) { - for (i=0; i < orte_local_children->size; i++) { - if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { - continue; - } - if (!child->alive) { - continue; - } - if (0 == child->pid) { - /* race condition */ - continue; - } - stats = OBJ_NEW(opal_pstats_t); - if (ORTE_SUCCESS != opal_pstat.query(child->pid, stats, NULL)) { - /* may hit a race condition where the process has - * terminated, so just ignore any error - */ - OBJ_RELEASE(stats); - continue; - } - /* the stats framework can't know nodename or rank */ - strncpy(stats->node, orte_process_info.nodename, OPAL_PSTAT_MAX_STRING_LEN); - stats->rank = child->name.vpid; - /* store it */ - if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&child->stats, stats))) { - OBJ_RELEASE(st); - } - /* pack them */ - if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return; - } - } - } - - /* xfer any data for transmission */ - if (0 < buf.bytes_used) { - bptr = &buf; - if (OPAL_SUCCESS != (rc = opal_dss.pack(orte_sensor_base.samples, &bptr, 1, OPAL_BUFFER))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&buf); - return; - } - } - OBJ_DESTRUCT(&buf); - - /* are there any issues with node-level usage? 
*/ - nst = (opal_node_stats_t*)opal_ring_buffer_poke(&my_node->stats, -1); - if (NULL != nst && 0.0 < mca_sensor_resusage_component.node_memory_limit) { - OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output, - "%s CHECKING NODE MEM", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* compute the percentage of node memory in-use */ - in_use = 1.0 - (nst->free_mem / nst->total_mem); - OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output, - "%s PERCENT USED: %f LIMIT: %f", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - in_use, mca_sensor_resusage_component.node_memory_limit)); - if (mca_sensor_resusage_component.node_memory_limit <= in_use) { - /* loop through our children and find the biggest hog */ - hog = NULL; - max_mem = 0.0; - for (i=0; i < orte_local_children->size; i++) { - if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { - continue; - } - if (!child->alive) { - continue; - } - if (0 == child->pid) { - /* race condition */ - continue; - } - if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) { - continue; - } - OPAL_OUTPUT_VERBOSE((5, orte_sensor_base_framework.framework_output, - "%s PROC %s AT VSIZE %f", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&child->name), st->vsize)); - if (max_mem < st->vsize) { - hog = child; - max_mem = st->vsize; - } - } - if (NULL == hog) { - /* if all children dead and we are still too big, - * then we must be the culprit - abort - */ - OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output, - "%s NO CHILD: COMMITTING SUICIDE", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - orte_errmgr.abort(ORTE_ERR_MEM_LIMIT_EXCEEDED, NULL); - } else { - /* report the problem */ - OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output, - "%s REPORTING %s TO ERRMGR FOR EXCEEDING LIMITS", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&hog->name))); - ORTE_ACTIVATE_PROC_STATE(&hog->name, 
ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); - } - /* since we have ordered someone to die, we've done enough for this - * time around - don't check proc limits as well - */ - return; - } - } - - /* check proc limits */ - if (0.0 < mca_sensor_resusage_component.proc_memory_limit) { - OPAL_OUTPUT_VERBOSE((2, orte_sensor_base_framework.framework_output, - "%s CHECKING PROC MEM", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* check my children first */ - for (i=0; i < orte_local_children->size; i++) { - if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { - continue; - } - if (!child->alive) { - continue; - } - if (0 == child->pid) { - /* race condition */ - continue; - } - if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) { - continue; - } - OPAL_OUTPUT_VERBOSE((5, orte_sensor_base_framework.framework_output, - "%s PROC %s AT VSIZE %f", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&child->name), st->vsize)); - if (mca_sensor_resusage_component.proc_memory_limit <= st->vsize) { - /* report the problem */ - ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); - } - } - } -} - -static void res_log(opal_buffer_t *sample) -{ - opal_pstats_t *st=NULL; - opal_node_stats_t *nst=NULL; - int rc, n, i; - opal_value_t kv[14]; - char *node; - - if (!log_enabled) { - return; - } - - /* unpack the node name */ - n=1; - if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &node, &n, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - return; - } - - /* unpack the node stats */ - n=1; - if (OPAL_SUCCESS != (rc = opal_dss.unpack(sample, &nst, &n, OPAL_NODE_STAT))) { - ORTE_ERROR_LOG(rc); - return; - } - - if (mca_sensor_resusage_component.log_node_stats) { - /* convert this into an array of opal_value_t's - no clean way - * to do this, so have to just manually map each field - */ - for (i=0; i < 13; i++) { - OBJ_CONSTRUCT(&kv[i], opal_value_t); - } - i=0; - kv[i].key = strdup("ctime"); - kv[i].type = 
OPAL_TIMEVAL; - kv[i].data.tv.tv_sec = nst->sample_time.tv_sec; - kv[i++].data.tv.tv_usec = nst->sample_time.tv_usec; - - kv[i].key = "hostname"; - kv[i].type = OPAL_STRING; - kv[i++].data.string = strdup(node); - - kv[i].key = strdup("total_mem"); - kv[i].type = OPAL_FLOAT; - kv[i++].data.fval = nst->total_mem; - - kv[i].key = strdup("free_mem"); - kv[i].type = OPAL_FLOAT; - kv[i++].data.fval = nst->free_mem; - - kv[i].key = strdup("buffers"); - kv[i].type = OPAL_FLOAT; - kv[i++].data.fval = nst->buffers; - - kv[i].key = strdup("cached"); - kv[i].type = OPAL_FLOAT; - kv[i++].data.fval = nst->cached; - - kv[i].key = strdup("swap_total"); - kv[i].type = OPAL_FLOAT; - kv[i++].data.fval = nst->swap_total; - - kv[i].key = strdup("swap_free"); - kv[i].type = OPAL_FLOAT; - kv[i++].data.fval = nst->swap_free; - - kv[i].key = strdup("mapped"); - kv[i].type = OPAL_FLOAT; - kv[i++].data.fval = nst->mapped; - - kv[i].key = strdup("swap_cached"); - kv[i].type = OPAL_FLOAT; - kv[i++].data.fval = nst->swap_cached; - - kv[i].key = strdup("la"); - kv[i].type = OPAL_FLOAT; - kv[i++].data.fval = nst->la; - - kv[i].key = strdup("la5"); - kv[i].type = OPAL_FLOAT; - kv[i++].data.fval = nst->la5; - - kv[i].key = strdup("la15"); - kv[i].type = OPAL_FLOAT; - kv[i++].data.fval = nst->la15; - - /* store it */ - if (ORTE_SUCCESS != (rc = opal_db.add_log("nodestats", kv, 12))) { - /* don't bark about it - just quietly disable the log */ - log_enabled = false; - } - for (i=0; i < 12; i++) { - OBJ_DESTRUCT(&kv[i]); - } - } - - OBJ_RELEASE(nst); - - if (mca_sensor_resusage_component.log_process_stats) { - /* unpack all process stats */ - n=1; - while (OPAL_SUCCESS == (rc = opal_dss.unpack(sample, &st, &n, OPAL_PSTAT))) { - for (i=0; i < 14; i++) { - OBJ_CONSTRUCT(&kv[i], opal_value_t); - } - kv[0].key = strdup("node"); - kv[0].type = OPAL_STRING; - kv[0].data.string = strdup(st->node); - kv[1].key = strdup("rank"); - kv[1].type = OPAL_INT32; - kv[1].data.int32 = st->rank; - kv[2].key = 
strdup("pid"); - kv[2].type = OPAL_PID; - kv[2].data.pid = st->pid; - kv[3].key = strdup("cmd"); - kv[3].type = OPAL_STRING; - kv[3].data.string = strdup(st->cmd); - kv[4].key = strdup("state"); - kv[4].type = OPAL_STRING; - kv[4].data.string = (char*)malloc(3 * sizeof(char)); - kv[4].data.string[0] = st->state[0]; - kv[4].data.string[1] = st->state[1]; - kv[4].data.string[2] = '\0'; - kv[5].key = strdup("time"); - kv[5].type = OPAL_TIMEVAL; - kv[5].data.tv.tv_sec = st->time.tv_sec; - kv[5].data.tv.tv_usec = st->time.tv_usec; - kv[6].key = strdup("percent_cpu"); - kv[6].type = OPAL_FLOAT; - kv[6].data.fval = st->percent_cpu; - kv[7].key = strdup("priority"); - kv[7].type = OPAL_INT32; - kv[7].data.int32 = st->priority; - kv[8].key = strdup("num_threads"); - kv[8].type = OPAL_INT16; - kv[8].data.int16 = st->num_threads; - kv[9].key = strdup("vsize"); - kv[9].type = OPAL_FLOAT; - kv[9].data.fval = st->vsize; - kv[10].key = strdup("rss"); - kv[10].type = OPAL_FLOAT; - kv[10].data.fval = st->rss; - kv[11].key = strdup("peak_vsize"); - kv[11].type = OPAL_FLOAT; - kv[11].data.fval = st->peak_vsize; - kv[12].key = strdup("processor"); - kv[12].type = OPAL_INT16; - kv[12].data.int16 = st->processor; - kv[13].key = strdup("sample_time"); - kv[13].type = OPAL_TIMEVAL; - kv[13].data.tv.tv_sec = st->sample_time.tv_sec; - kv[13].data.tv.tv_usec = st->sample_time.tv_usec; - /* store it */ - if (ORTE_SUCCESS != (rc = opal_db.add_log("procstats", kv, 14))) { - log_enabled = false; - } - for (i=0; i < 14; i++) { - OBJ_DESTRUCT(&kv[i]); - } - OBJ_RELEASE(st); - n=1; - } - if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { - ORTE_ERROR_LOG(rc); - } - } -} diff --git a/orte/mca/sensor/resusage/sensor_resusage.h b/orte/mca/sensor/resusage/sensor_resusage.h deleted file mode 100644 index 83f326089f6..00000000000 --- a/orte/mca/sensor/resusage/sensor_resusage.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. 
- * - * Copyright (c) 2017 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * - * Process Resource Utilization sensor - */ -#ifndef ORTE_SENSOR_RESUSAGE_H -#define ORTE_SENSOR_RESUSAGE_H - -#include "orte_config.h" - -#include "orte/mca/sensor/sensor.h" - -BEGIN_C_DECLS - -struct orte_sensor_resusage_component_t { - orte_sensor_base_component_t super; - int sample_rate; - float node_memory_limit; - float proc_memory_limit; - bool log_node_stats; - bool log_process_stats; -}; -typedef struct orte_sensor_resusage_component_t orte_sensor_resusage_component_t; - -ORTE_MODULE_DECLSPEC extern orte_sensor_resusage_component_t mca_sensor_resusage_component; -extern orte_sensor_base_module_t orte_sensor_resusage_module; - - -END_C_DECLS - -#endif diff --git a/orte/mca/sensor/resusage/sensor_resusage_component.c b/orte/mca/sensor/resusage/sensor_resusage_component.c deleted file mode 100644 index 2d9aafcaed0..00000000000 --- a/orte/mca/sensor/resusage/sensor_resusage_component.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2017 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/base/base.h" -#include "opal/util/output.h" -#include "opal/class/opal_pointer_array.h" - -#include "orte/util/proc_info.h" -#include "orte/util/show_help.h" - -#include "sensor_resusage.h" - -/* - * Local functions - */ -static int orte_sensor_resusage_register (void); -static int orte_sensor_resusage_open(void); -static int orte_sensor_resusage_close(void); -static int orte_sensor_resusage_query(mca_base_module_t **module, int *priority); - -orte_sensor_resusage_component_t mca_sensor_resusage_component = { - { - { - ORTE_SENSOR_BASE_VERSION_1_0_0, - - "resusage", /* MCA component name */ - ORTE_MAJOR_VERSION, /* MCA component major version */ - ORTE_MINOR_VERSION, /* MCA component minor version */ - ORTE_RELEASE_VERSION, /* MCA component release version */ - orte_sensor_resusage_open, /* component open */ - orte_sensor_resusage_close, /* component close */ - orte_sensor_resusage_query, /* component query */ - orte_sensor_resusage_register - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - "procresource,noderesource" - } -}; - -static int node_memory_limit; -static int proc_memory_limit; - -/** - * component open/close/init function - */ -static int orte_sensor_resusage_register (void) -{ - mca_base_component_t *c = &mca_sensor_resusage_component.super.base_version; - - mca_sensor_resusage_component.sample_rate = 0; - (void) mca_base_component_var_register (c, "sample_rate", "Sample rate in seconds (default: 0)", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_sensor_resusage_component.sample_rate); - if (mca_sensor_resusage_component.sample_rate < 0) { - opal_output(0, "Illegal value %d - must be > 0", mca_sensor_resusage_component.sample_rate); - return ORTE_ERR_BAD_PARAM; - } - - node_memory_limit = 0; - (void) 
mca_base_component_var_register (c, "node_memory_limit", - "Percentage of total memory that can be in-use", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &node_memory_limit); - mca_sensor_resusage_component.node_memory_limit = (float)node_memory_limit/100.0; - - proc_memory_limit = 0; - (void) mca_base_component_var_register (c, "proc_memory_limit", - "Max virtual memory size in MBytes", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &proc_memory_limit); - mca_sensor_resusage_component.proc_memory_limit = (float) proc_memory_limit; - - mca_sensor_resusage_component.log_node_stats = false; - (void) mca_base_component_var_register (c, "log_node_stats", "Log the node stats", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_sensor_resusage_component.log_node_stats); - - mca_sensor_resusage_component.log_process_stats = false; - (void) mca_base_component_var_register (c, "log_process_stats", "Log the process stats", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_sensor_resusage_component.log_process_stats); - - return ORTE_SUCCESS; -} - -static int orte_sensor_resusage_open(void) -{ - if (mca_sensor_resusage_component.sample_rate < 0) { - opal_output(0, "Illegal value %d - must be > 0", mca_sensor_resusage_component.sample_rate); - return ORTE_ERR_FATAL; - } - - mca_sensor_resusage_component.node_memory_limit = (float) node_memory_limit/100.0; - mca_sensor_resusage_component.proc_memory_limit = (float) proc_memory_limit; - - return ORTE_SUCCESS; -} - - -static int orte_sensor_resusage_query(mca_base_module_t **module, int *priority) -{ - *priority = 100; /* ahead of heartbeat */ - *module = (mca_base_module_t *)&orte_sensor_resusage_module; - - return ORTE_SUCCESS; -} - -/** - * Close all subsystems. 
- */ - -static int orte_sensor_resusage_close(void) -{ - return ORTE_SUCCESS; -} diff --git a/orte/mca/sensor/sensor.h b/orte/mca/sensor/sensor.h deleted file mode 100644 index e22852c4386..00000000000 --- a/orte/mca/sensor/sensor.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * @file: - * - */ - -#ifndef MCA_SENSOR_H -#define MCA_SENSOR_H - -/* - * includes - */ - -#include "orte_config.h" -#include "orte/types.h" - -#include "opal/mca/mca.h" - -BEGIN_C_DECLS - -/* - * Component functions - all MUST be provided! - */ - -/* start collecting data */ -typedef void (*orte_sensor_API_module_start_fn_t)(orte_jobid_t job); - -/* stop collecting data */ -typedef void (*orte_sensor_API_module_stop_fn_t)(orte_jobid_t job); - -/* API module */ -/* - * Ver 1.0 - */ -struct orte_sensor_base_API_module_1_0_0_t { - orte_sensor_API_module_start_fn_t start; - orte_sensor_API_module_stop_fn_t stop; -}; - -typedef struct orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_1_0_0_t; -typedef orte_sensor_base_API_module_1_0_0_t orte_sensor_base_API_module_t; - -/* initialize the module */ -typedef int (*orte_sensor_base_module_init_fn_t)(void); - -/* finalize the module */ -typedef void (*orte_sensor_base_module_finalize_fn_t)(void); - -/* tell the module to sample its sensor */ -typedef void (*orte_sensor_base_module_sample_fn_t)(void); - -/* pass a buffer to the module for logging */ -typedef void (*orte_sensor_base_module_log_fn_t)(opal_buffer_t *sample); - -/* - * Component modules Ver 1.0 - */ -struct orte_sensor_base_module_1_0_0_t { - orte_sensor_base_module_init_fn_t init; - orte_sensor_base_module_finalize_fn_t finalize; - orte_sensor_API_module_start_fn_t start; - 
orte_sensor_API_module_stop_fn_t stop; - orte_sensor_base_module_sample_fn_t sample; - orte_sensor_base_module_log_fn_t log; -}; - -typedef struct orte_sensor_base_module_1_0_0_t orte_sensor_base_module_1_0_0_t; -typedef orte_sensor_base_module_1_0_0_t orte_sensor_base_module_t; - -/* - * the standard component data structure - */ -struct orte_sensor_base_component_1_0_0_t { - mca_base_component_t base_version; - mca_base_component_data_t base_data; - char *data_measured; -}; -typedef struct orte_sensor_base_component_1_0_0_t orte_sensor_base_component_1_0_0_t; -typedef orte_sensor_base_component_1_0_0_t orte_sensor_base_component_t; - - - -/* - * Macro for use in components that are of type sensor v1.0.0 - */ -#define ORTE_SENSOR_BASE_VERSION_1_0_0 \ - /* sensor v1.0 is chained to MCA v2.0 */ \ - MCA_BASE_VERSION_2_0_0, \ - /* sensor v1.0 */ \ - "sensor", 1, 0, 0 - -/* Global structure for accessing sensor functions - */ -ORTE_DECLSPEC extern orte_sensor_base_API_module_t orte_sensor; /* holds API function pointers */ - -END_C_DECLS - -#endif /* MCA_SENSOR_H */ diff --git a/orte/mca/sensor/sensor_types.h b/orte/mca/sensor/sensor_types.h deleted file mode 100644 index 8d27fb2a20e..00000000000 --- a/orte/mca/sensor/sensor_types.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * - * Copyright (c) 2017 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file: - */ - -#ifndef ORTE_MCA_SENSOR_TYPES_H -#define ORTE_MCA_SENSOR_TYPES_H - -#include "orte_config.h" -#include "orte/constants.h" - -#ifdef HAVE_SYS_TIME_H -#include -#endif /* HAVE_SYS_TIME_H */ - -#include "opal/dss/dss_types.h" - -/* - * General SENSOR types - instanced in runtime/orte_globals.c - */ - -BEGIN_C_DECLS - -enum { - ORTE_SENSOR_SCALE_LINEAR, - ORTE_SENSOR_SCALE_LOG, - ORTE_SENSOR_SCALE_SIGMOID -}; - -/* - * Structure for passing data from sensors - */ -typedef struct { - opal_object_t super; - char *sensor; - struct timeval timestamp; - opal_byte_object_t data; -} orte_sensor_data_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_sensor_data_t); - -END_C_DECLS - -#endif diff --git a/orte/mca/state/base/state_base_fns.c b/orte/mca/state/base/state_base_fns.c index ffa85936196..69cfa8945a8 100644 --- a/orte/mca/state/base/state_base_fns.c +++ b/orte/mca/state/base/state_base_fns.c @@ -460,6 +460,7 @@ void orte_state_base_report_progress(int fd, short argc, void *cbdata) } static void _send_notification(int status, + orte_proc_state_t state, orte_process_name_t *proc, orte_process_name_t *target) { @@ -485,19 +486,43 @@ static void _send_notification(int status, return; } - /* the source is me */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { + /* the source is the proc */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, proc, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return; } - /* we are going to pass three opal_value_t's */ - rc = 3; - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buf); - return; + if (OPAL_ERR_PROC_ABORTED == status) { + /* we will pass four opal_value_t's */ + rc = 4; + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + return; + } + /* pass along the affected proc(s) */ + 
OBJ_CONSTRUCT(&kv, opal_value_t); + kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC); + kv.type = OPAL_NAME; + kv.data.name.jobid = proc->jobid; + kv.data.name.vpid = proc->vpid; + kvptr = &kv; + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&kv); + OBJ_RELEASE(buf); + return; + } + OBJ_DESTRUCT(&kv); + } else { + /* we are going to pass three opal_value_t's */ + rc = 3; + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + return; + } } /* pass along the affected proc(s) */ @@ -699,11 +724,11 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata) /* notify everyone who asked for it */ target.jobid = jdata->jobid; target.vpid = ORTE_VPID_WILDCARD; - _send_notification(OPAL_ERR_JOB_TERMINATED, &target, ORTE_NAME_WILDCARD); + _send_notification(OPAL_ERR_JOB_TERMINATED, pdata->state, &target, ORTE_NAME_WILDCARD); } else { target.jobid = jdata->jobid; target.vpid = ORTE_VPID_WILDCARD; - _send_notification(OPAL_ERR_JOB_TERMINATED, &target, &parent); + _send_notification(OPAL_ERR_JOB_TERMINATED, pdata->state, &target, &parent); } } } else if (ORTE_PROC_STATE_TERMINATED < pdata->state && @@ -711,7 +736,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata) /* if this was an abnormal term, notify the other procs of the termination */ parent.jobid = jdata->jobid; parent.vpid = ORTE_VPID_WILDCARD; - _send_notification(OPAL_ERR_PROC_ABORTED, &pdata->name, &parent); + _send_notification(OPAL_ERR_PROC_ABORTED, pdata->state, &pdata->name, &parent); } } diff --git a/orte/mca/state/dvm/state_dvm.c b/orte/mca/state/dvm/state_dvm.c index cb6b5b9fddb..6fcecd26bee 100644 --- a/orte/mca/state/dvm/state_dvm.c +++ b/orte/mca/state/dvm/state_dvm.c @@ -243,6 +243,7 @@ static void vm_ready(int fd, short args, void *cbdata) opal_byte_object_t bo, *boptr; int8_t flag; int32_t numbytes; + char *nidmap; /* if this is my job, then we are done */ 
if (ORTE_PROC_MY_NAME->jobid == caddy->jdata->jobid) { @@ -250,50 +251,65 @@ static void vm_ready(int fd, short args, void *cbdata) * do this here so we don't have to do it for every * job we are going to launch */ buf = OBJ_NEW(opal_buffer_t); - /* pack the "load nidmap" cmd */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buf); - return; + opal_dss.pack(buf, &command, 1, ORTE_DAEMON_CMD); + /* if we couldn't provide the allocation regex on the orted + * cmd line, then we need to provide all the info here */ + if (!orte_nidmap_communicated) { + if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&nidmap))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + return; + } + orte_nidmap_communicated = true; + } else { + nidmap = NULL; } - /* flag that daemons were launched so we will update the nidmap */ - flag = 1; - opal_dss.pack(buf, &flag, 1, OPAL_INT8); - /* construct a nodemap with everything in it */ - if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(buf))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buf); - return; + opal_dss.pack(buf, &nidmap, 1, OPAL_STRING); + if (NULL != nidmap) { + free(nidmap); } - - if (!orte_static_ports && !orte_fwd_mpirun_port) { - /* pack a flag indicating wiring info is provided */ + /* provide the info on the capabilities of each node */ + if (!orte_node_info_communicated) { flag = 1; opal_dss.pack(buf, &flag, 1, OPAL_INT8); - /* get wireup info for daemons per the selected routing module */ - wireup = OBJ_NEW(opal_buffer_t); - if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, wireup))) { + if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(buf))) { ORTE_ERROR_LOG(rc); - OBJ_RELEASE(wireup); OBJ_RELEASE(buf); return; } - /* put it in a byte object for xmission */ - opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes); - /* pack the byte object - zero-byte objects are fine */ - bo.size = numbytes; - boptr = &bo; - if (ORTE_SUCCESS != (rc = 
opal_dss.pack(buf, &boptr, 1, OPAL_BYTE_OBJECT))) { - ORTE_ERROR_LOG(rc); + orte_node_info_communicated = true; + if (!orte_static_ports && !orte_fwd_mpirun_port) { + /* pack a flag indicating wiring info is provided */ + flag = 1; + opal_dss.pack(buf, &flag, 1, OPAL_INT8); + /* get wireup info for daemons per the selected routing module */ + wireup = OBJ_NEW(opal_buffer_t); + if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, wireup))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(wireup); + OBJ_RELEASE(buf); + return; + } + /* put it in a byte object for xmission */ + opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes); + /* pack the byte object - zero-byte objects are fine */ + bo.size = numbytes; + boptr = &bo; + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &boptr, 1, OPAL_BYTE_OBJECT))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(wireup); + OBJ_RELEASE(buf); + return; + } + /* release the data since it has now been copied into our buffer */ + if (NULL != bo.bytes) { + free(bo.bytes); + } OBJ_RELEASE(wireup); - OBJ_RELEASE(buf); - return; - } - /* release the data since it has now been copied into our buffer */ - if (NULL != bo.bytes) { - free(bo.bytes); + } else { + flag = 0; + opal_dss.pack(buf, &flag, 1, OPAL_INT8); } - OBJ_RELEASE(wireup); } else { flag = 0; opal_dss.pack(buf, &flag, 1, OPAL_INT8); @@ -394,7 +410,7 @@ static void check_complete(int fd, short args, void *cbdata) * we call the errmgr so that any attempt to restart the job will * avoid doing so in the exact same place as the current job */ - if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { + if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { map = jdata->map; for (index = 0; index < map->nodes->size; index++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { diff --git a/orte/mca/state/orted/state_orted.c b/orte/mca/state/orted/state_orted.c index c782d55f990..708d69fca2f 100644 --- 
a/orte/mca/state/orted/state_orted.c +++ b/orte/mca/state/orted/state_orted.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ @@ -254,7 +254,7 @@ static void track_procs(int fd, short argc, void *cbdata) orte_job_t *jdata; orte_proc_t *pdata, *pptr; opal_buffer_t *alert; - int rc, i, j; + int rc, i; orte_plm_cmd_flag_t cmd; char *rtmod; orte_std_cntr_t index; diff --git a/orte/mca/state/state.h b/orte/mca/state/state.h index 4681af2e060..964d563b221 100644 --- a/orte/mca/state/state.h +++ b/orte/mca/state/state.h @@ -2,6 +2,7 @@ /* * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -79,33 +80,25 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_state_base_framework; do { \ orte_job_t *shadow=(j); \ opal_output_verbose(1, orte_state_base_framework.framework_output, \ - "%s ACTIVATE JOB %s STATE %s AT %s:%d", \ + "%s ACTIVATE JOB %s STATE %s AT %s:%d", \ ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ (NULL == shadow) ? "NULL" : \ - ORTE_JOBID_PRINT(shadow->jobid), \ + ORTE_JOBID_PRINT(shadow->jobid), \ orte_job_state_to_str((s)), \ - __FILE__, __LINE__); \ - /* sanity check */ \ - if ((s) < 0) { \ - assert(0); \ - } \ + __FILE__, __LINE__); \ orte_state.activate_job_state(shadow, (s)); \ } while(0); #define ORTE_ACTIVATE_PROC_STATE(p, s) \ do { \ orte_process_name_t *shadow=(p); \ - opal_output_verbose(1, orte_state_base_framework.framework_output, \ - "%s ACTIVATE PROC %s STATE %s AT %s:%d", \ + opal_output_verbose(1, orte_state_base_framework.framework_output, \ + "%s ACTIVATE PROC %s STATE %s AT %s:%d", \ ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ (NULL == shadow) ? 
"NULL" : \ - ORTE_NAME_PRINT(shadow), \ - orte_proc_state_to_str((s)), \ - __FILE__, __LINE__); \ - /* sanity check */ \ - if ((s) < 0) { \ - assert(0); \ - } \ + ORTE_NAME_PRINT(shadow), \ + orte_proc_state_to_str((s)), \ + __FILE__, __LINE__); \ orte_state.activate_proc_state(shadow, (s)); \ } while(0); diff --git a/orte/mca/sensor/ft_tester/sensor_ft_tester.c b/orte/orted/ft_tester.c similarity index 99% rename from orte/mca/sensor/ft_tester/sensor_ft_tester.c rename to orte/orted/ft_tester.c index 1d7d62090cc..f614c65c9fa 100644 --- a/orte/mca/sensor/ft_tester/sensor_ft_tester.c +++ b/orte/orted/ft_tester.c @@ -1,13 +1,13 @@ /* - * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ diff --git a/orte/orted/help-orted.txt b/orte/orted/help-orted.txt index fb271f90d8c..c89d4e10157 100644 --- a/orte/orted/help-orted.txt +++ b/orte/orted/help-orted.txt @@ -10,7 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2014-2015 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -60,3 +60,23 @@ info key: key: %s The operation will continue, but may not behave completely as expected. +# +[timedout] +A request has timed out and will therefore fail: + + Operation: %s + +Your job may terminate as a result of this problem. You may want to +adjust the MCA parameter pmix_server_max_wait and try again. 
+# +[noroom] +A request for an asynchronous runtime operation cannot be fulfilled +because of a lack of room in the tracking array: + + Operation: %s + Number of rooms: %d + +This is usually caused by a large job that encounters significant +delays across the cluster when starting the application processes. +Your job may terminate as a result of this problem. You may want to +adjust the MCA parameter pmix_server_max_reqs and try again. diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index d0eed5c3a7a..4b5b7932c0e 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -15,7 +15,7 @@ * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. - * Copyright (c) 2016 Research Organization for Information Science + * Copyright (c) 2016-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * @@ -59,6 +59,7 @@ #include "orte/util/session_dir.h" #include "orte/util/name_fns.h" #include "orte/util/nidmap.h" +#include "orte/util/compress.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/grpcomm/base/base.h" @@ -69,6 +70,7 @@ #include "orte/mca/odls/base/base.h" #include "orte/mca/plm/plm.h" #include "orte/mca/plm/base/plm_private.h" +#include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/routed/routed.h" #include "orte/mca/ess/ess.h" #include "orte/mca/state/state.h" @@ -100,7 +102,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, int32_t signal; orte_jobid_t job; char *contact_info; - opal_buffer_t *answer; + opal_buffer_t data, *answer; orte_job_t *jdata; orte_process_name_t proc, proc2; orte_process_name_t *return_addr; @@ -122,6 +124,10 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, opal_pstats_t pstat; char *rtmod; char *coprocessors; + orte_job_map_t *map; + int8_t flag; + uint8_t *cmpdata; + size_t cmplen; /* unpack the command */ n = 1; @@ -527,7 +533,6 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, } break; - /**** TERMINATE JOB COMMAND ****/ case ORTE_DAEMON_TERMINATE_JOB_CMD: @@ -557,25 +562,85 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, } break; + + /**** DVM CLEANUP JOB COMMAND ****/ + case ORTE_DAEMON_DVM_CLEANUP_JOB_CMD: + /* unpack the jobid */ + n = 1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) { + ORTE_ERROR_LOG(ret); + goto CLEANUP; + } + + /* look up job data object */ + if (NULL == (jdata = orte_get_job_data_object(job))) { + /* we can safely ignore this request as the job + * was already cleaned up */ + goto CLEANUP; + } + + /* if we have any local children for this job, then we + * can ignore this request as we would have already + * dealt with it */ + if (0 < jdata->num_local_procs) { + goto CLEANUP; + } + + /* release all resources (even those on other nodes) that we + * assigned to this job */ 
+ if (NULL != jdata->map) { + map = (orte_job_map_t*)jdata->map; + for (n = 0; n < map->nodes->size; n++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) { + continue; + } + for (i = 0; i < node->procs->size; i++) { + if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { + continue; + } + if (proct->name.jobid != jdata->jobid) { + /* skip procs from another job */ + continue; + } + node->slots_inuse--; + node->num_procs--; + /* set the entry in the node array to NULL */ + opal_pointer_array_set_item(node->procs, i, NULL); + /* release the proc once for the map entry */ + OBJ_RELEASE(proct); + } + /* set the node location to NULL */ + opal_pointer_array_set_item(map->nodes, n, NULL); + /* maintain accounting */ + OBJ_RELEASE(node); + /* flag that the node is no longer in a map */ + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); + } + OBJ_RELEASE(map); + jdata->map = NULL; + } + break; + + /**** REPORT TOPOLOGY COMMAND ****/ case ORTE_DAEMON_REPORT_TOPOLOGY_CMD: - answer = OBJ_NEW(opal_buffer_t); + OBJ_CONSTRUCT(&data, opal_buffer_t); /* pack the topology signature */ - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &orte_topo_signature, 1, OPAL_STRING))) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(&data, &orte_topo_signature, 1, OPAL_STRING))) { ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); + OBJ_DESTRUCT(&data); goto CLEANUP; } /* pack the topology */ - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(&data, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) { ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); + OBJ_DESTRUCT(&data); goto CLEANUP; } /* detect and add any coprocessors */ coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology); - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &coprocessors, 1, OPAL_STRING))) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(&data, &coprocessors, 1, OPAL_STRING))) { 
ORTE_ERROR_LOG(ret); } if (NULL != coprocessors) { @@ -583,12 +648,54 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, } /* see if I am on a coprocessor */ coprocessors = opal_hwloc_base_check_on_coprocessor(); - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &coprocessors, 1, OPAL_STRING))) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(&data, &coprocessors, 1, OPAL_STRING))) { ORTE_ERROR_LOG(ret); } if (NULL!= coprocessors) { free(coprocessors); } + answer = OBJ_NEW(opal_buffer_t); + if (orte_util_compress_block((uint8_t*)data.base_ptr, data.bytes_used, + &cmpdata, &cmplen)) { + /* the data was compressed - mark that we compressed it */ + flag = 1; + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &flag, 1, OPAL_INT8))) { + ORTE_ERROR_LOG(ret); + free(cmpdata); + OBJ_DESTRUCT(&data); + } + /* pack the compressed length */ + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &cmplen, 1, OPAL_SIZE))) { + ORTE_ERROR_LOG(ret); + free(cmpdata); + OBJ_DESTRUCT(&data); + } + /* pack the uncompressed length */ + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &data.bytes_used, 1, OPAL_SIZE))) { + ORTE_ERROR_LOG(ret); + free(cmpdata); + OBJ_DESTRUCT(&data); + } + /* pack the compressed info */ + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, cmpdata, cmplen, OPAL_UINT8))) { + ORTE_ERROR_LOG(ret); + free(cmpdata); + OBJ_DESTRUCT(&data); + } + OBJ_DESTRUCT(&data); + free(cmpdata); + } else { + /* mark that it was not compressed */ + flag = 0; + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &flag, 1, OPAL_INT8))) { + ORTE_ERROR_LOG(ret); + OBJ_DESTRUCT(&data); + free(cmpdata); + } + /* transfer the payload across */ + opal_dss.copy_payload(answer, &data); + OBJ_DESTRUCT(&data); + } /* send the data */ if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit, sender, answer, ORTE_RML_TAG_TOPOLOGY_REPORT, @@ -1337,6 +1444,9 @@ static char *get_orted_comm_cmd_str(int command) case ORTE_DAEMON_GET_MEMPROFILE: return strdup("ORTE_DAEMON_GET_MEMPROFILE"); + 
case ORTE_DAEMON_DVM_CLEANUP_JOB_CMD: + return strdup("ORTE_DAEMON_DVM_CLEANUP_JOB_CMD"); + default: return strdup("Unknown Command!"); } diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index 76b62f6d1ec..c21e0f54f66 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -76,6 +76,7 @@ #include "orte/util/parse_options.h" #include "orte/mca/rml/base/rml_contact.h" #include "orte/util/pre_condition_transports.h" +#include "orte/util/compress.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ess/ess.h" @@ -793,9 +794,58 @@ int orte_daemon(int argc, char *argv[]) /* if we are rank=1, then send our topology back - otherwise, mpirun * will request it if necessary */ if (1 == ORTE_PROC_MY_NAME->vpid) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) { + opal_buffer_t data; + int8_t flag; + uint8_t *cmpdata; + size_t cmplen; + + /* setup an intermediate buffer */ + OBJ_CONSTRUCT(&data, opal_buffer_t); + + if (ORTE_SUCCESS != (ret = opal_dss.pack(&data, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) { ORTE_ERROR_LOG(ret); } + if (orte_util_compress_block((uint8_t*)data.base_ptr, data.bytes_used, + &cmpdata, &cmplen)) { + /* the data was compressed - mark that we compressed it */ + flag = 1; + if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT8))) { + ORTE_ERROR_LOG(ret); + free(cmpdata); + OBJ_DESTRUCT(&data); + } + /* pack the compressed length */ + if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &cmplen, 1, OPAL_SIZE))) { + ORTE_ERROR_LOG(ret); + free(cmpdata); + OBJ_DESTRUCT(&data); + } + /* pack the uncompressed length */ + if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &data.bytes_used, 1, OPAL_SIZE))) { + ORTE_ERROR_LOG(ret); + free(cmpdata); + OBJ_DESTRUCT(&data); + } + /* pack the compressed info */ + if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, cmpdata, cmplen, OPAL_UINT8))) { + ORTE_ERROR_LOG(ret); + free(cmpdata); + OBJ_DESTRUCT(&data); + } + 
OBJ_DESTRUCT(&data); + free(cmpdata); + } else { + /* mark that it was not compressed */ + flag = 0; + if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT8))) { + ORTE_ERROR_LOG(ret); + OBJ_DESTRUCT(&data); + free(cmpdata); + } + /* transfer the payload across */ + opal_dss.copy_payload(buffer, &data); + OBJ_DESTRUCT(&data); + } } /* send it to the designated target */ diff --git a/orte/orted/orted_submit.c b/orte/orted/orted_submit.c index 365203bdeb6..80090731766 100644 --- a/orte/orted/orted_submit.c +++ b/orte/orted/orted_submit.c @@ -10,9 +10,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006-2017 Cisco Systems, Inc. All rights reserved * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. 
* Copyright (c) 2015-2017 Research Organization for Information Science @@ -153,6 +153,7 @@ static void build_debugger_args(orte_app_context_t *debugger); static void open_fifo (void); static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, int argc, char *argv[], int num_procs); +static void print_help(void); /* instance the standard MPIR interfaces */ #define MPIR_MAX_PATH_LENGTH 512 @@ -322,18 +323,22 @@ int orte_submit_init(int argc, char *argv[], * exit with a giant warning flag */ if (0 == geteuid() && !orte_cmd_options.run_as_root) { + /* show_help is not yet available, so print an error manually */ fprintf(stderr, "--------------------------------------------------------------------------\n"); if (orte_cmd_options.help) { - fprintf(stderr, "%s cannot provide the help message when run as root.\n", orte_basename); + fprintf(stderr, "%s cannot provide the help message when run as root.\n\n", orte_basename); } else { - /* show_help is not yet available, so print an error manually */ - fprintf(stderr, "%s has detected an attempt to run as root.\n", orte_basename); + fprintf(stderr, "%s has detected an attempt to run as root.\n\n", orte_basename); } + fprintf(stderr, "Running as root is *strongly* discouraged as any mistake (e.g., in\n"); fprintf(stderr, "defining TMPDIR) or bug can result in catastrophic damage to the OS\n"); fprintf(stderr, "file system, leaving your system in an unusable state.\n\n"); + + fprintf(stderr, "We strongly suggest that you run %s as a non-root user.\n\n", orte_basename); + fprintf(stderr, "You can override this protection by adding the --allow-run-as-root\n"); - fprintf(stderr, "option to your cmd line. However, we reiterate our strong advice\n"); + fprintf(stderr, "option to your command line. 
However, we reiterate our strong advice\n"); fprintf(stderr, "against doing so - please do so at your own risk.\n"); fprintf(stderr, "--------------------------------------------------------------------------\n"); exit(1); @@ -351,24 +356,9 @@ int orte_submit_init(int argc, char *argv[], } /* Check for help request */ - if (orte_cmd_options.help) { - char *str, *args = NULL; - char *project_name = NULL; - if (0 == strcmp(orte_basename, "mpirun")) { - project_name = "Open MPI"; - } else { - project_name = "OpenRTE"; - } - args = opal_cmd_line_get_usage_msg(orte_cmd_line); - str = opal_show_help_string("help-orterun.txt", "orterun:usage", false, - orte_basename, project_name, OPAL_VERSION, - orte_basename, args, - PACKAGE_BUGREPORT); - if (NULL != str) { - printf("%s", str); - free(str); - } - free(args); + if (NULL != orte_cmd_options.help) { + print_help(); + /* If someone asks for help, that should be all we do */ exit(0); } @@ -585,6 +575,27 @@ int orte_submit_init(int argc, char *argv[], return ORTE_SUCCESS; } +static void print_help() +{ + char *str = NULL, *args; + char *project_name = NULL; + + if (0 == strcmp(orte_basename, "mpirun")) { + project_name = "Open MPI"; + } else { + project_name = "OpenRTE"; + } + args = opal_cmd_line_get_usage_msg(orte_cmd_line); + str = opal_show_help_string("help-orterun.txt", "orterun:usage", false, + orte_basename, project_name, OPAL_VERSION, + orte_basename, args, + PACKAGE_BUGREPORT); + if (NULL != str) { + printf("%s", str); + free(str); + } + free(args); +} void orte_submit_finalize(void) { @@ -1110,7 +1121,7 @@ int orte_submit_job(char *argv[], int *index, static int init_globals(void) { /* Reset the other fields every time */ - orte_cmd_options.help = false; + orte_cmd_options.help = NULL; orte_cmd_options.version = false; orte_cmd_options.num_procs = 0; if (NULL != orte_cmd_options.appfile) { @@ -2267,6 +2278,23 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata) opal_setenv(env_name, "1", true, 
&app->env); } free(env_name); + + /* setup the attach fifo in case someone wants to re-attach */ + if (orte_create_session_dirs) { + /* create the attachment FIFO and setup readevent - cannot be + * done if no session dirs exist! + */ + attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, + "debugger_attach_fifo", NULL); + if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) { + opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno); + free(attach_fifo); + return; + } + strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1); + free(attach_fifo); + open_fifo(); + } } static bool mpir_breakpoint_fired = false; @@ -2355,15 +2383,13 @@ static void orte_debugger_dump(void) "NULL" : (char*) MPIR_server_arguments); } -static void setup_debugger_job(void) +static void setup_debugger_job(orte_jobid_t jobid) { orte_job_t *debugger; orte_app_context_t *app; - orte_proc_t *proc; - int i, rc; - orte_node_t *node; - orte_vpid_t vpid=0; + int rc; char cwd[OPAL_PATH_MAX]; + bool flag = true; /* setup debugger daemon job */ debugger = OBJ_NEW(orte_job_t); @@ -2399,68 +2425,28 @@ static void setup_debugger_job(void) return; } app->cwd = strdup(cwd); - orte_remove_attribute(&app->attributes, ORTE_APP_USER_CWD); + orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL); opal_argv_append_nosize(&app->argv, app->app); build_debugger_args(app); opal_pointer_array_add(debugger->apps, app); debugger->num_apps = 1; - /* create a job map */ + /* create the map object and set the policy to 1ppn */ debugger->map = OBJ_NEW(orte_job_map_t); - /* in building the map, we want to launch one debugger daemon - * on each node that *already has an application process on it*. - * We cannot just launch one debugger daemon on EVERY node because - * the original job may not have placed procs on every node. So - * we construct the map here by cycling across all nodes, adding - * only those nodes where num_procs > 0. 
- */ - for (i=0; i < orte_node_pool->size; i++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { - continue; - } - /* if this node wasn't included in the vm, ignore it */ - if (NULL == node->daemon) { - continue; - } - /* if the node doesn't have any app procs on it, ignore it */ - if (node->num_procs < 1) { - continue; - } - /* this node has at least one proc, so add it to our map */ - OBJ_RETAIN(node); - opal_pointer_array_add(debugger->map->nodes, node); - debugger->map->num_nodes++; - /* add a debugger daemon to the node - note that the - * debugger daemon does NOT count against our subscribed slots - */ - proc = OBJ_NEW(orte_proc_t); - proc->name.jobid = debugger->jobid; - proc->name.vpid = vpid++; - /* point the proc at the local ORTE daemon as its parent */ - proc->parent = node->daemon->name.vpid; - /* set the local/node ranks - we don't actually care - * what these are, but the odls needs them - */ - proc->local_rank = 0; - proc->node_rank = 0; - proc->app_rank = proc->name.vpid; - /* flag the proc as ready for launch */ - proc->state = ORTE_PROC_STATE_INIT; - proc->app_idx = 0; - - OBJ_RETAIN(node); /* maintain accounting on object */ - proc->node = node; - /* add the proc to the job */ - opal_pointer_array_set_item(debugger->procs, proc->name.vpid, proc); - debugger->num_procs++; - - /* add the proc to the node's array */ - OBJ_RETAIN(proc); - opal_pointer_array_add(node->procs, (void*)proc); - node->num_procs++; - } - /* schedule it for launch */ - debugger->state = ORTE_JOB_STATE_INIT; - ORTE_ACTIVATE_JOB_STATE(debugger, ORTE_JOB_STATE_LAUNCH_APPS); + ORTE_SET_MAPPING_POLICY(debugger->map->mapping, ORTE_MAPPING_PPR); + ORTE_SET_MAPPING_DIRECTIVE(debugger->map->mapping, ORTE_MAPPING_GIVEN); + ORTE_SET_MAPPING_DIRECTIVE(debugger->map->mapping, ORTE_MAPPING_DEBUGGER); + /* define the ppr */ + debugger->map->ppr = strdup("1:node"); + /* mark that we do not want the daemon bound */ + if (ORTE_SUCCESS != (rc = 
opal_hwloc_base_set_binding_policy(&debugger->map->binding, "none"))) { + ORTE_ERROR_LOG(rc); + return; + } + /* spawn it */ + rc = orte_plm.spawn(debugger); + if (ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + } } /* @@ -2616,7 +2602,7 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon); - setup_debugger_job(); + setup_debugger_job(jdata->jobid); } /* we don't have anything else to do */ OBJ_RELEASE(caddy); @@ -2908,7 +2894,7 @@ static void attach_debugger(int fd, short event, void *arg) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon); - setup_debugger_job(); + setup_debugger_job(ORTE_JOBID_WILDCARD); did_once = true; } diff --git a/orte/orted/orted_submit.h b/orte/orted/orted_submit.h index f0fd4babea4..e325a0a04f5 100644 --- a/orte/orted/orted_submit.h +++ b/orte/orted/orted_submit.h @@ -3,6 +3,8 @@ * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -45,7 +47,7 @@ extern char MPIR_attach_fifo[]; * Global struct for caching orte command line options. 
*/ struct orte_cmd_options_t { - bool help; + char *help; bool version; bool verbose; char *report_pid; diff --git a/orte/orted/pmix/pmix_server.c b/orte/orted/pmix/pmix_server.c index 25d37b32330..63b4dbfdd39 100644 --- a/orte/orted/pmix/pmix_server.c +++ b/orte/orted/pmix/pmix_server.c @@ -83,6 +83,8 @@ static void pmix_server_dmdx_resp(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tg, void *cbdata); +#define ORTE_PMIX_SERVER_MIN_ROOMS 4096 + pmix_server_globals_t orte_pmix_server_globals = {0}; static opal_pmix_server_module_t pmix_server = { @@ -102,7 +104,9 @@ static opal_pmix_server_module_t pmix_server = { .notify_event = pmix_server_notify_event, .query = pmix_server_query_fn, .tool_connected = pmix_tool_connected_fn, - .log = pmix_server_log_fn + .log = pmix_server_log_fn, + .allocate = pmix_server_alloc_fn, + .job_control = pmix_server_job_ctrl_fn }; void pmix_server_register_params(void) @@ -120,7 +124,7 @@ void pmix_server_register_params(void) orte_pmix_server_globals.verbosity); } /* specify the size of the hotel */ - orte_pmix_server_globals.num_rooms = 256; + orte_pmix_server_globals.num_rooms = -1; (void) mca_base_var_register ("orte", "pmix", NULL, "server_max_reqs", "Maximum number of backlogged PMIx server direct modex requests", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, @@ -156,7 +160,7 @@ static void eviction_cbfunc(struct opal_hotel_t *hotel, { pmix_server_req_t *req = (pmix_server_req_t*)occupant; bool timeout = false; - int rc; + int rc=OPAL_ERR_TIMEOUT; /* decrement the request timeout */ req->timeout -= orte_pmix_server_globals.timeout; @@ -173,6 +177,8 @@ static void eviction_cbfunc(struct opal_hotel_t *hotel, } ORTE_ERROR_LOG(rc); /* fall thru and return an error so the caller doesn't hang */ + } else { + orte_show_help("help-orted.txt", "timedout", true, req->operation); } /* don't let the caller hang */ if (NULL != req->opcbfunc) { @@ -203,6 +209,17 @@ int pmix_server_init(void) /* setup the server's state 
variables */ OBJ_CONSTRUCT(&orte_pmix_server_globals.reqs, opal_hotel_t); + /* by the time we init the server, we should know how many nodes we + * have in our environment - with the exception of mpirun. If the + * user specified the size of the hotel, then use that value. Otherwise, + * set the value to something large to avoid running out of rooms on + * large machines */ + if (-1 == orte_pmix_server_globals.num_rooms) { + orte_pmix_server_globals.num_rooms = orte_process_info.num_procs * 2; + if (orte_pmix_server_globals.num_rooms < ORTE_PMIX_SERVER_MIN_ROOMS) { + orte_pmix_server_globals.num_rooms = ORTE_PMIX_SERVER_MIN_ROOMS; + } + } if (OPAL_SUCCESS != (rc = opal_hotel_init(&orte_pmix_server_globals.reqs, orte_pmix_server_globals.num_rooms, orte_event_base, orte_pmix_server_globals.timeout*1000000, @@ -265,6 +282,12 @@ int pmix_server_init(void) kv->type = OPAL_BOOL; kv->data.flag = true; opal_list_append(&info, &kv->super); + /* tell the server to use its own internal monitoring */ + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_SERVER_ENABLE_MONITORING); + kv->type = OPAL_BOOL; + kv->data.flag = true; + opal_list_append(&info, &kv->super); /* setup the local server */ if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) { @@ -525,10 +548,15 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender, * condition, so just log the request and we will fill * it later */ req = OBJ_NEW(pmix_server_req_t); + (void)asprintf(&req->operation, "DMDX: %s:%d", __FILE__, __LINE__); req->proxy = *sender; req->target = idreq; req->remote_room_num = room_num; + /* adjust the timeout to reflect the size of the job as it can take some + * amount of time to start the job */ + ORTE_ADJUST_TIMEOUT(req); if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); OBJ_RELEASE(req); 
send_error(rc, &idreq, sender); } @@ -547,10 +575,15 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender, /* track the request since the call down to the PMIx server * is asynchronous */ req = OBJ_NEW(pmix_server_req_t); + (void)asprintf(&req->operation, "DMDX: %s:%d", __FILE__, __LINE__); req->proxy = *sender; req->target = idreq; req->remote_room_num = room_num; + /* adjust the timeout to reflect the size of the job as it can take some + * amount of time to start the job */ + ORTE_ADJUST_TIMEOUT(req); if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); OBJ_RELEASE(req); send_error(rc, &idreq, sender); return; @@ -682,6 +715,7 @@ OBJ_CLASS_INSTANCE(orte_pmix_server_op_caddy_t, static void rqcon(pmix_server_req_t *p) { + p->operation = NULL; p->target = *ORTE_NAME_INVALID; p->proxy = *ORTE_NAME_INVALID; p->timeout = orte_pmix_server_globals.timeout; @@ -696,6 +730,9 @@ static void rqcon(pmix_server_req_t *p) } static void rqdes(pmix_server_req_t *p) { + if (NULL != p->operation) { + free(p->operation); + } if (NULL != p->jdata) { OBJ_RELEASE(p->jdata); } diff --git a/orte/orted/pmix/pmix_server_dyn.c b/orte/orted/pmix/pmix_server_dyn.c index 0c3254b0333..389c65a5fc8 100644 --- a/orte/orted/pmix/pmix_server_dyn.c +++ b/orte/orted/pmix/pmix_server_dyn.c @@ -105,7 +105,7 @@ static void spawn(int sd, short args, void *cbdata) /* add this request to our tracker hotel */ if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { - ORTE_ERROR_LOG(rc); + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); goto callback; } @@ -511,3 +511,13 @@ int pmix_server_disconnect_fn(opal_list_t *procs, opal_list_t *info, return rc; } + +int pmix_server_alloc_fn(const opal_process_name_t *requestor, + 
opal_pmix_alloc_directive_t dir, + opal_list_t *info, + opal_pmix_info_cbfunc_t cbfunc, + void *cbdata) +{ + /* ORTE currently has no way of supporting allocation requests */ + return ORTE_ERR_NOT_SUPPORTED; +} diff --git a/orte/orted/pmix/pmix_server_fence.c b/orte/orted/pmix/pmix_server_fence.c index 10f750e9ca1..750ad09b398 100644 --- a/orte/orted/pmix/pmix_server_fence.c +++ b/orte/orted/pmix/pmix_server_fence.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2014-2017 Research Organization for Information Science @@ -37,6 +37,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" +#include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/rml/rml.h" @@ -148,6 +149,10 @@ static void dmodex_req(int sd, short args, void *cbdata) return; } + /* adjust the timeout to reflect the size of the job as it can take some + * amount of time to start the job */ + ORTE_ADJUST_TIMEOUT(req); + /* has anyone already requested data for this target? 
If so, * then the data is already on its way */ for (rnum=0; rnum < orte_pmix_server_globals.reqs.num_rooms; rnum++) { @@ -160,7 +165,7 @@ static void dmodex_req(int sd, short args, void *cbdata) /* save the request in the hotel until the * data is returned */ if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { - ORTE_ERROR_LOG(rc); + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); /* can't just return as that would cause the requestor * to hang, so instead execute the callback */ goto callback; @@ -176,7 +181,7 @@ static void dmodex_req(int sd, short args, void *cbdata) * that we don't know about yet. In this case, just * record the request and we will process it later */ if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { - ORTE_ERROR_LOG(rc); + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); /* can't just return as that would cause the requestor * to hang, so instead execute the callback */ goto callback; @@ -205,7 +210,7 @@ static void dmodex_req(int sd, short args, void *cbdata) /* track the request so we know the function and cbdata * to callback upon completion */ if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { - ORTE_ERROR_LOG(rc); + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); goto callback; } diff --git a/orte/orted/pmix/pmix_server_gen.c b/orte/orted/pmix/pmix_server_gen.c index 71143d34196..9f2ae9eb76c 100644 --- a/orte/orted/pmix/pmix_server_gen.c +++ b/orte/orted/pmix/pmix_server_gen.c @@ -40,10 +40,12 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/iof/iof.h" #include "orte/mca/rmaps/rmaps_types.h" +#include "orte/mca/schizo/schizo.h" #include "orte/mca/state/state.h" #include "orte/util/name_fns.h" #include 
"orte/runtime/orte_globals.h" #include "orte/mca/rml/rml.h" +#include "orte/mca/plm/plm.h" #include "orte/mca/plm/base/plm_private.h" #include "pmix_server_internal.h" @@ -609,7 +611,15 @@ static void _query(int sd, short args, void *cbdata) * and ask directly for the info - if rank=wildcard, then * we need to xcast the request and collect the results */ } - + } else if (0 == strcmp(q->keys[n], OPAL_PMIX_TIME_REMAINING)) { + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_TIME_REMAINING); + kv->type = OPAL_UINT32; + if (ORTE_SUCCESS != orte_schizo.get_remaining_time(&kv->data.uint32)) { + OBJ_RELEASE(kv); + } else { + opal_list_append(results, &kv->super); + } } } } @@ -811,3 +821,62 @@ void pmix_server_log_fn(opal_process_name_t *requestor, cbfunc(OPAL_SUCCESS, cbdata); } } + +int pmix_server_job_ctrl_fn(const opal_process_name_t *requestor, + opal_list_t *targets, + opal_list_t *info, + opal_pmix_info_cbfunc_t cbfunc, + void *cbdata) +{ + opal_value_t *val; + int rc, n; + orte_proc_t *proc; + opal_pointer_array_t parray, *ptrarray; + opal_namelist_t *nm; + + opal_output_verbose(2, orte_pmix_server_globals.output, + "%s job control request from %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(requestor)); + + OPAL_LIST_FOREACH(val, info, opal_value_t) { + if (NULL == val->key) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + continue; + } + + if (0 == strcmp(val->key, OPAL_PMIX_JOB_CTRL_KILL)) { + /* convert the list of targets to a pointer array */ + if (NULL == targets) { + ptrarray = NULL; + } else { + OBJ_CONSTRUCT(&parray, opal_pointer_array_t); + OPAL_LIST_FOREACH(nm, targets, opal_namelist_t) { + /* get the proc object for this proc */ + if (NULL == (proc = orte_get_proc_object(&nm->name))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + continue; + } + OBJ_RETAIN(proc); + opal_pointer_array_add(&parray, proc); + } + ptrarray = &parray; + } + if (ORTE_SUCCESS != (rc = orte_plm.terminate_procs(ptrarray))) { + ORTE_ERROR_LOG(rc); + } + if (NULL != 
ptrarray) { + /* cleanup the array */ + for (n=0; n < parray.size; n++) { + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(&parray, n))) { + OBJ_RELEASE(proc); + } + } + OBJ_DESTRUCT(&parray); + } + continue; + } + } + + return ORTE_SUCCESS; +} diff --git a/orte/orted/pmix/pmix_server_internal.h b/orte/orted/pmix/pmix_server_internal.h index 31766eba017..5712529b5c7 100644 --- a/orte/orted/pmix/pmix_server_internal.h +++ b/orte/orted/pmix/pmix_server_internal.h @@ -12,7 +12,7 @@ * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2014 Research Organization for Information Science @@ -48,11 +48,21 @@ BEGIN_C_DECLS +#define ORTED_PMIX_MIN_DMX_TIMEOUT 10 +#define ORTE_ADJUST_TIMEOUT(a) \ + do { \ + (a)->timeout = (2 * orte_process_info.num_daemons) / 1000; \ + if ((a)->timeout < ORTED_PMIX_MIN_DMX_TIMEOUT) { \ + (a)->timeout = ORTED_PMIX_MIN_DMX_TIMEOUT; \ + } \ + } while(0) + /* object for tracking requests so we can * correctly route the eventual reply */ typedef struct { opal_object_t super; opal_event_t ev; + char *operation; int status; int timeout; int room_num; @@ -100,6 +110,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t); do { \ pmix_server_req_t *_req; \ _req = OBJ_NEW(pmix_server_req_t); \ + (void)asprintf(&_req->operation, "DMDX: %s:%d", __FILE__, __LINE__); \ _req->target = (p); \ _req->mdxcbfunc = (ocf); \ _req->cbdata = (ocd); \ @@ -113,6 +124,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t); do { \ pmix_server_req_t *_req; \ _req = OBJ_NEW(pmix_server_req_t); \ + (void)asprintf(&_req->operation, "SPAWN: %s:%d", __FILE__, __LINE__); \ _req->jdata = (j); \ _req->spcbfunc = (ocf); \ _req->cbdata = (ocd); \ @@ -206,6 +218,18 @@ extern 
void pmix_server_log_fn(opal_process_name_t *requestor, opal_pmix_op_cbfunc_t cbfunc, void *cbdata); +extern int pmix_server_alloc_fn(const opal_process_name_t *requestor, + opal_pmix_alloc_directive_t dir, + opal_list_t *info, + opal_pmix_info_cbfunc_t cbfunc, + void *cbdata); + +extern int pmix_server_job_ctrl_fn(const opal_process_name_t *requestor, + opal_list_t *targets, + opal_list_t *info, + opal_pmix_info_cbfunc_t cbfunc, + void *cbdata); + /* declare the RML recv functions for responses */ extern void pmix_server_launch_resp(int status, orte_process_name_t* sender, opal_buffer_t *buffer, diff --git a/orte/orted/pmix/pmix_server_pub.c b/orte/orted/pmix/pmix_server_pub.c index 0b3ec8d109f..4dcb9cfb755 100644 --- a/orte/orted/pmix/pmix_server_pub.c +++ b/orte/orted/pmix/pmix_server_pub.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014 Mellanox Technologies, Inc. * All rights reserved. 
* Copyright (c) 2014-2016 Research Organization for Information Science @@ -38,6 +38,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" +#include "orte/util/show_help.h" #include "orte/runtime/orte_data_server.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/rml/rml.h" @@ -52,7 +53,7 @@ static void execute(int sd, short args, void *cbdata) /* add this request to our tracker hotel */ if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { - ORTE_ERROR_LOG(rc); + orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms); goto callback; } @@ -100,6 +101,7 @@ int pmix_server_publish_fn(opal_process_name_t *proc, /* create the caddy */ req = OBJ_NEW(pmix_server_req_t); + (void)asprintf(&req->operation, "PUBLISH: %s:%d", __FILE__, __LINE__); req->opcbfunc = cbfunc; req->cbdata = cbdata; @@ -207,6 +209,7 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys, /* create the caddy */ req = OBJ_NEW(pmix_server_req_t); + (void)asprintf(&req->operation, "LOOKUP: %s:%d", __FILE__, __LINE__); req->lkcbfunc = cbfunc; req->cbdata = cbdata; @@ -302,6 +305,7 @@ int pmix_server_unpublish_fn(opal_process_name_t *proc, char **keys, /* create the caddy */ req = OBJ_NEW(pmix_server_req_t); + (void)asprintf(&req->operation, "UNPUBLISH: %s:%d", __FILE__, __LINE__); req->opcbfunc = cbfunc; req->cbdata = cbdata; @@ -468,4 +472,3 @@ void pmix_server_keyval_client(int status, orte_process_name_t* sender, OBJ_RELEASE(req); } } - diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index d4a740f3864..68826c4abf0 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -108,6 +108,8 @@ bool orte_display_allocation = false; bool orte_display_devel_allocation = false; bool orte_soft_locations = false; int orted_pmi_version = 0; +bool orte_nidmap_communicated = false; +bool orte_node_info_communicated = false; /* launch agents 
*/ char *orte_launch_agent = NULL; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index b864e5cd8e1..0b46dfc73db 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -491,6 +491,8 @@ ORTE_DECLSPEC extern bool orte_display_allocation; ORTE_DECLSPEC extern bool orte_display_devel_allocation; ORTE_DECLSPEC extern bool orte_soft_locations; ORTE_DECLSPEC extern bool orte_hnp_connected; +ORTE_DECLSPEC extern bool orte_nidmap_communicated; +ORTE_DECLSPEC extern bool orte_node_info_communicated; /* launch agents */ ORTE_DECLSPEC extern char *orte_launch_agent; diff --git a/orte/runtime/orte_init.c b/orte/runtime/orte_init.c index 8e5ccb82bed..03eaab0c0f6 100644 --- a/orte/runtime/orte_init.c +++ b/orte/runtime/orte_init.c @@ -13,7 +13,7 @@ * reserved. * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* @@ -37,7 +37,6 @@ #include "opal/util/error.h" #include "opal/util/output.h" #include "opal/util/proc.h" -#include "opal/util/timings.h" #include "opal/runtime/opal.h" #include "opal/threads/threads.h" @@ -152,6 +151,7 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags) opal_snprintf_jobid = orte_util_snprintf_jobid; opal_convert_string_to_jobid = _convert_string_to_jobid; + /* initialize the opal layer */ if (ORTE_SUCCESS != (ret = opal_init(pargc, pargv))) { error = "opal_init"; @@ -210,10 +210,12 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags) error = "orte_schizo_base_open"; goto error; } + if (ORTE_SUCCESS != (ret = orte_schizo_base_select())) { error = "orte_schizo_base_select"; goto error; } + /* if we are an app, let SCHIZO help us determine our environment */ if (ORTE_PROC_IS_APP) { (void)orte_schizo.check_launch_environment(); @@ -225,6 +227,7 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags) error = "orte_ess_base_open"; goto error; } + if (ORTE_SUCCESS != (ret = orte_ess_base_select())) { error = "orte_ess_base_select"; goto error; @@ -255,10 +258,6 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags) opal_process_info.my_local_rank = (int32_t)orte_process_info.my_local_rank; opal_process_info.cpuset = orte_process_info.cpuset; -#if OPAL_ENABLE_TIMING - opal_timing_set_jobid(ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); -#endif - if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { /* start listening - will be ignored if no listeners * were registered */ diff --git a/orte/runtime/orte_quit.c b/orte/runtime/orte_quit.c index ca383ac71d3..d665556d13e 100644 --- a/orte/runtime/orte_quit.c +++ b/orte/runtime/orte_quit.c @@ -15,7 +15,7 @@ * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. 
All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -258,8 +258,8 @@ int orte_print_aborted_job(orte_job_t *job, default: if (0 != proc->exit_code) { orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true, - orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name, - (unsigned long)proc->name.vpid); + orte_basename, proc->exit_code, ORTE_ERROR_NAME(proc->exit_code), + node->name, (unsigned long)proc->name.vpid); } else { orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true, orte_basename, node->name); @@ -345,7 +345,7 @@ static void dump_aborted_procs(void) /* find the job that caused the problem */ n = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&job, &nptr); while (OPAL_SUCCESS == n) { - if (job->jobid == ORTE_PROC_MY_NAME->jobid) { + if (NULL == job || job->jobid == ORTE_PROC_MY_NAME->jobid) { goto next; } if (ORTE_JOB_STATE_UNDEF != job->state && diff --git a/orte/tools/orte-dvm/orte-dvm.c b/orte/tools/orte-dvm/orte-dvm.c index f2101c9b620..901cb90acd8 100644 --- a/orte/tools/orte-dvm/orte-dvm.c +++ b/orte/tools/orte-dvm/orte-dvm.c @@ -10,11 +10,11 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006-2017 Cisco Systems, Inc. All rights reserved * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -75,6 +75,7 @@ #include "opal/class/opal_pointer_array.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/odls/odls.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/base/rml_contact.h" @@ -221,18 +222,22 @@ int main(int argc, char *argv[]) * exit with a giant warning flag */ if (0 == geteuid() && !myglobals.run_as_root) { + /* show_help is not yet available, so print an error manually */ fprintf(stderr, "--------------------------------------------------------------------------\n"); if (myglobals.help) { - fprintf(stderr, "%s cannot provide the help message when run as root\n", orte_basename); + fprintf(stderr, "%s cannot provide the help message when run as root.\n\n", orte_basename); } else { - /* show_help is not yet available, so print an error manually */ - fprintf(stderr, "%s has detected an attempt to run as root.\n", orte_basename); + fprintf(stderr, "%s has detected an attempt to run as root.\n\n", orte_basename); } - fprintf(stderr, " This is *strongly* discouraged as any mistake (e.g., in defining TMPDIR) or bug can\n"); - fprintf(stderr, "result in catastrophic damage to the OS file system, leaving\n"); - fprintf(stderr, "your system in an unusable state.\n\n"); + + fprintf(stderr, "Running as root is *strongly* discouraged as any mistake (e.g., in\n"); + fprintf(stderr, "defining TMPDIR) or bug can result in catastrophic damage to the OS\n"); + fprintf(stderr, "file system, leaving your system in an unusable state.\n\n"); + + fprintf(stderr, "We strongly suggest that you run %s as a non-root user.\n\n", orte_basename); + fprintf(stderr, "You can override this protection by adding the --allow-run-as-root\n"); - fprintf(stderr, "option to your cmd line. However, we reiterate our strong advice\n"); + fprintf(stderr, "option to your command line.
However, we reiterate our strong advice\n"); fprintf(stderr, "against doing so - please do so at your own risk.\n"); fprintf(stderr, "--------------------------------------------------------------------------\n"); exit(1); @@ -515,6 +520,8 @@ static void notify_requestor(int sd, short args, void *cbdata) orte_proc_t *pptr; int ret, id, *idptr; opal_buffer_t *reply; + orte_daemon_cmd_flag_t command; + orte_grpcomm_signature_t *sig; /* notify the requestor */ reply = OBJ_NEW(opal_buffer_t); @@ -553,6 +560,24 @@ static void notify_requestor(int sd, short args, void *cbdata) ORTE_RML_TAG_NOTIFY_COMPLETE, send_callback, jdata); + /* now ensure that _all_ daemons know that this job has terminated so even + * those that did not participate in it will know to cleanup the resources + * they assigned to the job. This is necessary now that the mapping function + * has been moved to the backend daemons - otherwise, non-participating daemons + * retain the slot assignments on the participating daemons, and then incorrectly + * map subsequent jobs thinking those nodes are still "busy" */ + reply = OBJ_NEW(opal_buffer_t); + command = ORTE_DAEMON_DVM_CLEANUP_JOB_CMD; + opal_dss.pack(reply, &command, 1, ORTE_DAEMON_CMD); + opal_dss.pack(reply, &jdata->jobid, 1, ORTE_JOBID); + sig = OBJ_NEW(orte_grpcomm_signature_t); + sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t)); + sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid; + sig->signature[0].vpid = ORTE_VPID_WILDCARD; + orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, reply); + OBJ_RELEASE(reply); + OBJ_RELEASE(sig); + /* we cannot cleanup the job object as we might * hit an error during transmission, so clean it * up in the send callback */ diff --git a/orte/tools/orterun/help-orterun.txt b/orte/tools/orterun/help-orterun.txt index c7aca563d22..ff49f2e786b 100644 --- a/orte/tools/orterun/help-orterun.txt +++ b/orte/tools/orterun/help-orterun.txt @@ -12,6 +12,7 @@ # All rights reserved. 
# Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -296,6 +297,7 @@ while attempting to start process rank %lu. %s was unable to start the specified application as it encountered an error: +Error code: %d Error name: %s Node: %s diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 1a2b8ff40d1..1ff6a98a34d 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -10,9 +10,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006-2017 Cisco Systems, Inc. All rights reserved * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science @@ -130,6 +130,8 @@ int orterun(int argc, char *argv[]) { orte_submit_status_t launchst, completest; + /* orte_submit_init() will also check if the user is running as + root (and may issue a warning/exit). 
*/ if (ORTE_SUCCESS != orte_submit_init(argc, argv, NULL)) { exit(1); } @@ -140,7 +142,7 @@ int orterun(int argc, char *argv[]) */ if (0 == geteuid() && !orte_cmd_options.run_as_root) { fprintf(stderr, "--------------------------------------------------------------------------\n"); - if (orte_cmd_options.help) { + if (NULL != orte_cmd_options.help) { fprintf(stderr, "%s cannot provide the help message when run as root.\n", orte_basename); } else { /* show_help is not yet available, so print an error manually */ diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c index 3e9c2239b57..801373cb669 100644 --- a/orte/util/error_strings.c +++ b/orte/util/error_strings.c @@ -89,7 +89,7 @@ int orte_err2str(int errnum, const char **errmsg) if (orte_report_silent_errors) { retval = "Silent error"; } else { - retval = NULL; + retval = ""; } break; case ORTE_ERR_ADDRESSEE_UNKNOWN: @@ -174,7 +174,7 @@ int orte_err2str(int errnum, const char **errmsg) if (orte_report_silent_errors) { retval = "Next option"; } else { - retval = NULL; + retval = ""; } break; case ORTE_ERR_SENSOR_LIMIT_EXCEEDED: @@ -244,11 +244,7 @@ int orte_err2str(int errnum, const char **errmsg) retval = "Partial success"; break; default: - if (orte_report_silent_errors) { - retval = "Unknown error"; - } else { - retval = NULL; - } + retval = "Unknown error"; } *errmsg = retval; diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index be0437bf209..02ef5b8e7d8 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -62,6 +62,7 @@ #include "orte/mca/dfs/dfs.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/odls/base/odls_private.h" +#include "orte/mca/rmaps/base/base.h" #include "orte/mca/routed/routed.h" #include "orte/util/show_help.h" #include "orte/util/proc_info.h" @@ -73,24 +74,18 @@ #include "orte/util/nidmap.h" -int orte_util_build_daemon_nidmap(char **nodes) +int orte_util_build_daemon_nidmap(void) { - int i, num_nodes; + int i; int rc; struct hostent *h; + orte_node_t *node; 
opal_buffer_t buf; opal_process_name_t proc; char *uri, *addr; char *proc_name; opal_value_t kv; - num_nodes = opal_argv_count(nodes); - - if (0 == num_nodes) { - /* nothing to do */ - return ORTE_SUCCESS; - } - /* install the entry for the HNP */ proc.jobid = ORTE_PROC_MY_NAME->jobid; proc.vpid = 0; @@ -105,16 +100,22 @@ int orte_util_build_daemon_nidmap(char **nodes) } OBJ_DESTRUCT(&kv); - /* the daemon vpids will be assigned in order, - * starting with vpid=0 for the HNP */ + /* we must have already built the node pool, so cycle across it */ OBJ_CONSTRUCT(&buf, opal_buffer_t); - for (i=0; i < num_nodes; i++) { + for (i=0; i < orte_node_pool->size; i++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + if (NULL == node->daemon) { + /* this node isn't occupied */ + continue; + } /* define the vpid for this daemon */ - proc.vpid = i; + proc.vpid = node->daemon->name.vpid; /* store the hostname for the proc */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_HOSTNAME); - kv.data.string = strdup(nodes[i]); + kv.data.string = strdup(node->name); kv.type = OPAL_STRING; if (OPAL_SUCCESS != (rc = opal_pmix.store_local(&proc, &kv))) { ORTE_ERROR_LOG(rc); @@ -138,7 +139,7 @@ int orte_util_build_daemon_nidmap(char **nodes) OBJ_DESTRUCT(&kv); /* lookup the address of this node */ - if (NULL == (h = gethostbyname(nodes[i]))) { + if (NULL == (h = gethostbyname(node->name))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } @@ -157,7 +158,11 @@ int orte_util_build_daemon_nidmap(char **nodes) OPAL_OUTPUT_VERBOSE((2, orte_debug_verbosity, "%s orte:util:build:daemon:nidmap node %s daemon %d addr %s uri %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - nodes[i], i+1, addr, uri)); + node->name, i+1, addr, uri)); + /* if this is the HNP, then store it */ + if (!ORTE_PROC_IS_HNP && 0 == i) { + orte_process_info.my_hnp_uri = strdup(uri); + } opal_dss.pack(&buf, &uri, 1, OPAL_STRING); free(proc_name); 
free(uri); @@ -172,136 +177,69 @@ int orte_util_build_daemon_nidmap(char **nodes) return rc; } -int orte_util_encode_nodemap(opal_buffer_t *buffer) +int orte_util_nidmap_create(char **regex) { char *node; char prefix[ORTE_MAX_NODE_PREFIX]; int i, j, n, len, startnum, nodenum, numdigits; - bool found, fullname, test; - char *suffix, *sfx; + bool found, fullname; + char *suffix, *sfx, *nodenames; orte_regex_node_t *ndreg; - orte_regex_range_t *range, *rng, *slt, *tp, *flg; - opal_list_t nodenms, dvpids, slots, topos, flags; + orte_regex_range_t *range, *rng; + opal_list_t nodenms, dvpids; opal_list_item_t *item, *itm2; char **regexargs = NULL, *tmp, *tmp2; orte_node_t *nptr; - int rc; - uint8_t ui8; + orte_vpid_t vpid; - /* setup the list of results */ OBJ_CONSTRUCT(&nodenms, opal_list_t); OBJ_CONSTRUCT(&dvpids, opal_list_t); - OBJ_CONSTRUCT(&slots, opal_list_t); - OBJ_CONSTRUCT(&topos, opal_list_t); - OBJ_CONSTRUCT(&flags, opal_list_t); rng = NULL; - slt = NULL; - tp = NULL; - flg = NULL; for (n=0; n < orte_node_pool->size; n++) { if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { continue; } /* if no daemon has been assigned, then this node is not being used */ if (NULL == nptr->daemon) { - continue; + vpid = -1; // indicates no daemon assigned + } else { + vpid = nptr->daemon->name.vpid; } /* deal with the daemon vpid - see if it is next in the * current range */ if (NULL == rng) { /* just starting */ rng = OBJ_NEW(orte_regex_range_t); - rng->start = nptr->daemon->name.vpid; + rng->vpid = vpid; rng->cnt = 1; opal_list_append(&dvpids, &rng->super); - } else { - /* is this the next in line */ - if (nptr->daemon->name.vpid == (orte_vpid_t)(rng->start + rng->cnt)) { + } else if (UINT32_MAX == vpid) { + if (-1 == rng->vpid) { rng->cnt++; } else { /* need to start another range */ rng = OBJ_NEW(orte_regex_range_t); - rng->start = nptr->daemon->name.vpid; + rng->vpid = vpid; rng->cnt = 1; opal_list_append(&dvpids, &rng->super); } - } - 
/* check the #slots */ - if (NULL == slt) { - /* just starting */ - slt = OBJ_NEW(orte_regex_range_t); - slt->start = nptr->daemon->name.vpid; - slt->slots = nptr->slots; - slt->cnt = 1; - opal_list_append(&slots, &slt->super); - } else { - /* is this the next in line */ - if (nptr->slots == slt->slots) { - slt->cnt++; - } else { - /* need to start another range */ - slt = OBJ_NEW(orte_regex_range_t); - slt->start = nptr->daemon->name.vpid; - slt->slots = nptr->slots; - slt->cnt = 1; - opal_list_append(&slots, &slt->super); - } - } - /* check the topologies */ - if (NULL == tp) { - if (NULL != nptr->topology) { - /* just starting */ - tp = OBJ_NEW(orte_regex_range_t); - tp->start = nptr->daemon->name.vpid; - tp->t = nptr->topology; - tp->cnt = 1; - opal_list_append(&topos, &tp->super); - } - } else { - if (NULL != nptr->topology) { - /* is this the next in line */ - if (tp->t == nptr->topology) { - tp->cnt++; - } else { - /* need to start another range */ - tp = OBJ_NEW(orte_regex_range_t); - tp->start = nptr->daemon->name.vpid; - tp->t = nptr->topology; - tp->cnt = 1; - opal_list_append(&topos, &tp->super); - } - } - } - /* check the flags */ - test = ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN); - if (NULL == flg) { - /* just starting */ - flg = OBJ_NEW(orte_regex_range_t); - flg->start = nptr->daemon->name.vpid; - if (test) { - flg->slots = 1; - } else { - flg->slots = 0; - } - flg->cnt = 1; - opal_list_append(&flags, &flg->super); + } else if (-1 == rng->vpid) { + /* need to start another range */ + rng = OBJ_NEW(orte_regex_range_t); + rng->vpid = vpid; + rng->cnt = 1; + opal_list_append(&dvpids, &rng->super); } else { /* is this the next in line */ - if ((test && 1 == flg->slots) || - (!test && 0 == flg->slots)) { - flg->cnt++; + if (vpid == (orte_vpid_t)(rng->vpid + rng->cnt)) { + rng->cnt++; } else { /* need to start another range */ - flg = OBJ_NEW(orte_regex_range_t); - flg->start = nptr->daemon->name.vpid; - if (test) { - flg->slots = 1; - } else { - 
flg->slots = 0; - } - flg->cnt = 1; - opal_list_append(&flags, &flg->super); + rng = OBJ_NEW(orte_regex_range_t); + rng->vpid = vpid; + rng->cnt = 1; + opal_list_append(&dvpids, &rng->super); } } node = nptr->name; @@ -387,16 +325,16 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) if (NULL == range) { /* first range for this nodeid */ range = OBJ_NEW(orte_regex_range_t); - range->start = nodenum; + range->vpid = nodenum; range->cnt = 1; opal_list_append(&ndreg->ranges, &range->super); break; } /* see if the node number is out of sequence */ - if (nodenum != (range->start + range->cnt)) { + if (nodenum != (range->vpid + range->cnt)) { /* start a new range */ range = OBJ_NEW(orte_regex_range_t); - range->start = nodenum; + range->vpid = nodenum; range->cnt = 1; opal_list_append(&ndreg->ranges, &range->super); break; @@ -420,7 +358,7 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) * care of names we can't compress above */ range = OBJ_NEW(orte_regex_range_t); - range->start = nodenum; + range->vpid = nodenum; range->cnt = 1; opal_list_append(&ndreg->ranges, &range->super); } @@ -428,7 +366,6 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) free(suffix); } } - /* begin constructing the regular expression */ while (NULL != (item = opal_list_remove_first(&nodenms))) { ndreg = (orte_regex_node_t*)item; @@ -454,9 +391,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) while (NULL != (itm2 = opal_list_remove_first(&ndreg->ranges))) { range = (orte_regex_range_t*)itm2; if (1 == range->cnt) { - asprintf(&tmp2, "%s%d,", tmp, range->start); + asprintf(&tmp2, "%s%u,", tmp, range->vpid); } else { - asprintf(&tmp2, "%s%d-%d,", tmp, range->start, range->start + range->cnt - 1); + asprintf(&tmp2, "%s%u-%u,", tmp, range->vpid, range->vpid + range->cnt - 1); } free(tmp); tmp = tmp2; @@ -476,39 +413,28 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) } /* assemble final result */ - tmp = opal_argv_join(regexargs, ','); + nodenames = 
opal_argv_join(regexargs, ','); /* cleanup */ opal_argv_free(regexargs); OBJ_DESTRUCT(&nodenms); - /* pack the string */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - OPAL_LIST_DESTRUCT(&dvpids); - OPAL_LIST_DESTRUCT(&slots); - return rc; - } - if (NULL != tmp) { - free(tmp); - } - /* do the same for the vpids */ tmp = NULL; while (NULL != (item = opal_list_remove_first(&dvpids))) { rng = (orte_regex_range_t*)item; if (1 < rng->cnt) { if (NULL == tmp) { - asprintf(&tmp, "%d-%d", rng->start, rng->start + rng->cnt - 1); + asprintf(&tmp, "%u(%u)", rng->vpid, rng->cnt); } else { - asprintf(&tmp2, "%s,%d-%d", tmp, rng->start, rng->start + rng->cnt - 1); + asprintf(&tmp2, "%s,%u(%u)", tmp, rng->vpid, rng->cnt); free(tmp); tmp = tmp2; } } else { if (NULL == tmp) { - asprintf(&tmp, "%d", rng->start); + asprintf(&tmp, "%u", rng->vpid); } else { - asprintf(&tmp2, "%s,%d", tmp, rng->start); + asprintf(&tmp2, "%s,%u", tmp, rng->vpid); free(tmp); tmp = tmp2; } @@ -517,37 +443,142 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) } OPAL_LIST_DESTRUCT(&dvpids); - /* pack the string */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { + /* now concatenate the results into one string */ + asprintf(&tmp2, "%s@%s", nodenames, tmp); + free(nodenames); + free(tmp); + + *regex = tmp2; + return ORTE_SUCCESS; +} + +int orte_util_encode_nodemap(opal_buffer_t *buffer) +{ + int n; + bool test; + orte_regex_range_t *rng, *slt, *tp, *flg; + opal_list_t slots, topos, flags; + opal_list_item_t *item; + char *tmp, *tmp2; + orte_node_t *nptr; + int rc; + uint8_t ui8; + + /* setup the list of results */ + OBJ_CONSTRUCT(&slots, opal_list_t); + OBJ_CONSTRUCT(&topos, opal_list_t); + OBJ_CONSTRUCT(&flags, opal_list_t); + + slt = NULL; + tp = NULL; + flg = NULL; + + /* pack a flag indicating if the HNP was included in the allocation */ + if (orte_hnp_is_allocated) { + ui8 = 1; + } else { + ui8 = 0; + } + if 
(ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &ui8, 1, OPAL_UINT8))) { ORTE_ERROR_LOG(rc); - OPAL_LIST_DESTRUCT(&slots); return rc; } - if (NULL != tmp) { - free(tmp); + + /* pack a flag indicating if we are in a managed allocation */ + if (orte_managed_allocation) { + ui8 = 1; + } else { + ui8 = 0; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &ui8, 1, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + return rc; } - /* do the same to pass #slots on each node */ - tmp = NULL; - while (NULL != (item = opal_list_remove_first(&slots))) { - rng = (orte_regex_range_t*)item; - if (1 < rng->cnt) { - if (NULL == tmp) { - asprintf(&tmp, "%d-%d[%d]", rng->start, rng->start + rng->cnt - 1, rng->slots); + for (n=0; n < orte_node_pool->size; n++) { + if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { + continue; + } + /* check the #slots */ + if (NULL == slt) { + /* just starting */ + slt = OBJ_NEW(orte_regex_range_t); + slt->slots = nptr->slots; + slt->cnt = 1; + opal_list_append(&slots, &slt->super); + } else { + /* is this the next in line */ + if (nptr->slots == slt->slots) { + slt->cnt++; } else { - asprintf(&tmp2, "%s,%d-%d[%d]", tmp, rng->start, rng->start + rng->cnt - 1, rng->slots); - free(tmp); - tmp = tmp2; + /* need to start another range */ + slt = OBJ_NEW(orte_regex_range_t); + slt->slots = nptr->slots; + slt->cnt = 1; + opal_list_append(&slots, &slt->super); } + } + /* check the topologies */ + if (NULL == tp) { + /* just starting */ + tp = OBJ_NEW(orte_regex_range_t); + tp->t = nptr->topology; + tp->cnt = 1; + opal_list_append(&topos, &tp->super); } else { - if (NULL == tmp) { - asprintf(&tmp, "%d[%d]", rng->start, rng->slots); + /* is this the next in line */ + if (tp->t == nptr->topology) { + tp->cnt++; } else { - asprintf(&tmp2, "%s,%d[%d]", tmp, rng->start, rng->slots); - free(tmp); - tmp = tmp2; + /* need to start another range */ + tp = OBJ_NEW(orte_regex_range_t); + tp->t = nptr->topology; + tp->cnt = 1; + 
opal_list_append(&topos, &tp->super); } } + /* check the flags */ + test = ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN); + if (NULL == flg) { + /* just starting */ + flg = OBJ_NEW(orte_regex_range_t); + if (test) { + flg->slots = 1; + } else { + flg->slots = 0; + } + flg->cnt = 1; + opal_list_append(&flags, &flg->super); + } else { + /* is this the next in line */ + if ((test && 1 == flg->slots) || + (!test && 0 == flg->slots)) { + flg->cnt++; + } else { + /* need to start another range */ + flg = OBJ_NEW(orte_regex_range_t); + if (test) { + flg->slots = 1; + } else { + flg->slots = 0; + } + flg->cnt = 1; + opal_list_append(&flags, &flg->super); + } + } + } + + /* pass #slots on each node */ + tmp = NULL; + while (NULL != (item = opal_list_remove_first(&slots))) { + rng = (orte_regex_range_t*)item; + if (NULL == tmp) { + asprintf(&tmp, "%d[%d]", rng->cnt, rng->slots); + } else { + asprintf(&tmp2, "%s,%d[%d]", tmp, rng->cnt, rng->slots); + free(tmp); + tmp = tmp2; + } OBJ_RELEASE(rng); } OPAL_LIST_DESTRUCT(&slots); @@ -565,22 +596,12 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) tmp = NULL; while (NULL != (item = opal_list_remove_first(&flags))) { rng = (orte_regex_range_t*)item; - if (1 < rng->cnt) { - if (NULL == tmp) { - asprintf(&tmp, "%d-%d[%x]", rng->start, rng->start + rng->cnt - 1, rng->slots); - } else { - asprintf(&tmp2, "%s,%d-%d[%x]", tmp, rng->start, rng->start + rng->cnt - 1, rng->slots); - free(tmp); - tmp = tmp2; - } + if (NULL == tmp) { + asprintf(&tmp, "%d[%d]", rng->cnt, rng->slots); } else { - if (NULL == tmp) { - asprintf(&tmp, "%d[%x]", rng->start, rng->slots); - } else { - asprintf(&tmp2, "%s,%d[%x]", tmp, rng->start, rng->slots); - free(tmp); - tmp = tmp2; - } + asprintf(&tmp2, "%s,%d[%d]", tmp, rng->cnt, rng->slots); + free(tmp); + tmp = tmp2; } OBJ_RELEASE(rng); } @@ -595,53 +616,44 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) free(tmp); } - /* pack a flag indicating if the HNP was included in the allocation */ - if 
(orte_hnp_is_allocated) { - ui8 = 1; - } else { - ui8 = 0; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &ui8, 1, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* pack a flag indicating if we are in a managed allocation */ - if (orte_managed_allocation) { - ui8 = 1; - } else { - ui8 = 0; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &ui8, 1, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* handle the topologies - as the most common case by far * is to have homogeneous topologies, we only send them - * if something is different */ + * if something is different. We know that the HNP is + * the first topology, and that any differing topology + * on the compute nodes must follow. So send the topologies + * if and only if: + * + * (a) the HNP is being used to house application procs and + * there is more than one topology on our list; or + * + * (b) the HNP is not being used, but there are more than + * two topologies on our list, thus indicating that + * there are multiple topologies on the compute nodes + */ + if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) { + /* remove the first topo on the list */ + item = opal_list_remove_first(&topos); + OBJ_RELEASE(item); + } tmp = NULL; if (1 < opal_list_get_size(&topos)) { opal_buffer_t bucket, *bptr; OBJ_CONSTRUCT(&bucket, opal_buffer_t); while (NULL != (item = opal_list_remove_first(&topos))) { rng = (orte_regex_range_t*)item; - if (1 < rng->cnt) { - if (NULL == tmp) { - asprintf(&tmp, "%d-%d", rng->start, rng->start + rng->cnt - 1); - } else { - asprintf(&tmp2, "%s,%d-%d", tmp, rng->start, rng->start + rng->cnt - 1); - free(tmp); - tmp = tmp2; - } + if (NULL == rng->t) { + /* when we pass thru here prior to launching the daemons, we + * won't have topologies for them and so this entry might + * be NULL - protect ourselves */ + OBJ_RELEASE(item); + continue; + } + if (NULL == tmp) { + asprintf(&tmp, "%d", rng->cnt); } 
else { - if (NULL == tmp) { - asprintf(&tmp, "%d", rng->start); - } else { - asprintf(&tmp2, "%s,%d", tmp, rng->start); - free(tmp); - tmp = tmp2; - } + asprintf(&tmp2, "%s,%d", tmp, rng->cnt); + free(tmp); + tmp = tmp2; } /* pack this topology string */ if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->sig, 1, OPAL_STRING))) { @@ -693,49 +705,173 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) return ORTE_SUCCESS; } -/* decode a nodemap for a daemon */ -int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) +int orte_util_nidmap_parse(char *regex) { - int n, nn, rc; - orte_node_t *node; - size_t k, endpt, start; + char *nodelist, *vpids, *ptr; + char **nodes, **dvpids; + int rc, n, cnt; + orte_regex_range_t *rng; + opal_list_t dids; orte_job_t *daemons; - orte_proc_t *dptr; - char **nodes=NULL, *dvpids=NULL, *slots=NULL, *topos=NULL, *flags=NULL; - char *ndnames, *rmndr, **tmp; - opal_list_t dids, slts, flgs;; - opal_buffer_t *bptr=NULL; - orte_topology_t *t; - orte_regex_range_t *rng, *drng, *srng, *frng; - uint8_t ui8; + orte_node_t *nd; + orte_proc_t *proc; - /* unpack the node regex */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ndnames, &n, OPAL_STRING))) { + /* if we are the HNP, we don't need to parse this */ + if (ORTE_PROC_IS_HNP) { + return ORTE_SUCCESS; + } + + /* split the regex into its node and vpid parts */ + nodelist = regex; + vpids = strchr(regex, '@'); + if (NULL == vpids) { + /* indicates the regex got mangled somewhere */ + return ORTE_ERR_BAD_PARAM; + } + *vpids = '\0'; // terminate the nodelist string + ++vpids; // step over the separator + if (NULL == vpids || '\0' == *vpids) { + /* indicates the regex got mangled somewhere */ + return ORTE_ERR_BAD_PARAM; + } + + /* decompress the nodes regex */ + nodes = NULL; + if (ORTE_SUCCESS != (rc = orte_regex_extract_node_names(nodelist, &nodes))) { ORTE_ERROR_LOG(rc); return rc; } - /* it is okay for this to be NULL */ - if (NULL == ndnames) { - return 
ORTE_SUCCESS; + + if (NULL == nodes) { + /* should not happen */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; } + /* decompress the vpids */ OBJ_CONSTRUCT(&dids, opal_list_t); + dvpids = opal_argv_split(vpids, ','); + for (n=0; NULL != dvpids[n]; n++) { + rng = OBJ_NEW(orte_regex_range_t); + opal_list_append(&dids, &rng->super); + /* check for a count */ + if (NULL != (ptr = strchr(dvpids[n], '('))) { + *ptr = '\0'; + dvpids[n][strlen(dvpids[n])-2] = '\0'; // remove trailing paren + ++ptr; + rng->cnt = strtoul(ptr, NULL, 10); + } + /* convert the number - since it might be a range, + * save the remainder pointer */ + rng->vpid = strtoul(dvpids[n], NULL, 10); + } + opal_argv_free(dvpids); + + /* get the daemon job object */ + daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + + /* create the node pool array - this will include + * _all_ nodes known to the allocation */ + rng = (orte_regex_range_t*)opal_list_get_first(&dids); + cnt = 0; + for (n=0; NULL != nodes[n]; n++) { + nd = OBJ_NEW(orte_node_t); + nd->name = nodes[n]; + opal_pointer_array_set_item(orte_node_pool, n, nd); + /* see if it has a daemon on it */ + if (-1 != rng->vpid) { + /* we have a daemon, so let's create the tracker for it */ + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, rng->vpid+cnt))) { + proc = OBJ_NEW(orte_proc_t); + proc->name.jobid = ORTE_PROC_MY_NAME->jobid; + proc->name.vpid = rng->vpid + cnt; + proc->state = ORTE_PROC_STATE_RUNNING; + ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE); + daemons->num_procs++; + opal_pointer_array_set_item(daemons->procs, proc->name.vpid, proc); + } + nd->index = proc->name.vpid; + OBJ_RETAIN(nd); + proc->node = nd; + OBJ_RETAIN(proc); + nd->daemon = proc; + } + ++cnt; + if (cnt == rng->cnt) { + rng = (orte_regex_range_t*)opal_list_get_next(&rng->super); + if (NULL == rng) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + } + } + + /* unpdate num procs */ + if 
(orte_process_info.num_procs != daemons->num_procs) { + orte_process_info.num_procs = daemons->num_procs; + /* need to update the routing plan */ + orte_routed.update_routing_plan(NULL); + } + + if (orte_process_info.max_procs < orte_process_info.num_procs) { + orte_process_info.max_procs = orte_process_info.num_procs; + } + + if (0 < opal_output_get_verbosity(orte_debug_verbosity)) { + int i; + for (i=0; i < orte_node_pool->size; i++) { + if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + opal_output(0, "%s node[%d].name %s daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i, + (NULL == nd->name) ? "NULL" : nd->name, + (NULL == nd->daemon) ? "NONE" : ORTE_VPID_PRINT(nd->daemon->name.vpid)); + } + } + + return ORTE_SUCCESS; +} + +/* decode a nodemap for a daemon */ +int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) +{ + int n, nn, rc, cnt, offset; + orte_node_t *node; + char *slots=NULL, *topos=NULL, *flags=NULL; + char *rmndr, **tmp; + opal_list_t slts, flgs;; + opal_buffer_t *bptr=NULL; + orte_topology_t *t2; + orte_regex_range_t *rng, *srng, *frng; + uint8_t ui8; + OBJ_CONSTRUCT(&slts, opal_list_t); OBJ_CONSTRUCT(&flgs, opal_list_t); - /* unpack the daemon vpid regex */ + /* unpack the flag indicating if the HNP was allocated */ n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &dvpids, &n, OPAL_STRING))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ui8, &n, OPAL_UINT8))) { ORTE_ERROR_LOG(rc); goto cleanup; } - /* this is not allowed to be NULL */ - if (NULL == dvpids) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - rc = ORTE_ERR_BAD_PARAM; + if (0 == ui8) { + orte_hnp_is_allocated = false; + } else { + orte_hnp_is_allocated = true; + } + + /* unpack the flag indicating we are in a managed allocation */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ui8, &n, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); goto cleanup; } + if (0 == ui8) { + orte_managed_allocation = false; + } else { + 
orte_managed_allocation = true; + } /* unpack the slots regex */ n = 1; @@ -763,30 +899,6 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) goto cleanup; } - /* unpack the flag indicating if the HNP was allocated */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ui8, &n, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - if (0 == ui8) { - orte_hnp_is_allocated = false; - } else { - orte_hnp_is_allocated = true; - } - - /* unpack the flag indicating we are in a managed allocation */ - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &ui8, &n, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - if (0 == ui8) { - orte_managed_allocation = false; - } else { - orte_managed_allocation = true; - } - /* unpack the topos regex - this may not have been * provided (e.g., for a homogeneous machine) */ n = 1; @@ -811,38 +923,6 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) goto cleanup; } - /* decompress the regex */ - nodes = NULL; - if (ORTE_SUCCESS != (rc = orte_regex_extract_node_names(ndnames, &nodes))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - if (NULL == nodes) { - /* should not happen */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto cleanup; - } - - /* decompress the vpids */ - tmp = opal_argv_split(dvpids, ','); - for (n=0; NULL != tmp[n]; n++) { - rng = OBJ_NEW(orte_regex_range_t); - opal_list_append(&dids, &rng->super); - /* convert the number - since it might be a range, - * save the remainder pointer */ - rng->start = strtoul(tmp[n], &rmndr, 10); - if (NULL == rmndr || 0 == strlen(rmndr)) { - rng->endpt = rng->start; - } else { - /* it must be a range - find the endpoint */ - ++rmndr; - rng->endpt = strtoul(rmndr, NULL, 10); - } - } - opal_argv_free(tmp); - /* decompress the slots */ tmp = opal_argv_split(slots, ','); for (n=0; NULL != tmp[n]; n++) { @@ -861,16 +941,8 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) /* convert that number as this is the 
number of * slots for this range */ rng->slots = strtoul(rmndr, NULL, 10); - /* convert the starting pt - since it might be a range, - * save the remainder pointer */ - rng->start = strtoul(tmp[n], &rmndr, 10); - if (NULL == rmndr || 0 == strlen(rmndr)) { - rng->endpt = rng->start; - } else { - /* it must be a range - find the endpoint */ - ++rmndr; - rng->endpt = strtoul(rmndr, NULL, 10); - } + /* convert the initial number as that is the cnt */ + rng->cnt = strtoul(tmp[n], NULL, 10); } opal_argv_free(tmp); @@ -878,7 +950,7 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) tmp = opal_argv_split(flags, ','); for (n=0; NULL != tmp[n]; n++) { rng = OBJ_NEW(orte_regex_range_t); - opal_list_append(&dids, &rng->super); + opal_list_append(&flgs, &rng->super); /* find the '[' as that delimits the value */ rmndr = strchr(tmp[n], '['); if (NULL == rmndr) { @@ -895,43 +967,23 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) } else { rng->slots = 0; } - /* convert the starting pt - since it might be a range, - * save the remainder pointer */ - rng->start = strtoul(tmp[n], &rmndr, 10); - if (NULL == rmndr || 0 == strlen(rmndr)) { - rng->endpt = rng->start; - } else { - /* it must be a range - find the endpoint */ - ++rmndr; - rng->endpt = strtoul(rmndr, NULL, 10); - } + /* convert the initial number as that is the cnt */ + rng->cnt = strtoul(tmp[n], NULL, 10); } opal_argv_free(tmp); free(flags); - /* get the daemon job object */ - daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - /* update the node array */ - drng = (orte_regex_range_t*)opal_list_get_first(&dids); srng = (orte_regex_range_t*)opal_list_get_first(&slts); frng = (orte_regex_range_t*)opal_list_get_first(&flgs); - for (n=0; NULL != nodes[n]; n++) { - /* the daemon vpids for these nodes will be in the dids array, so - * use those to lookup the nodes */ - nn = drng->start + n; - if (nn == drng->endpt) { - drng = (orte_regex_range_t*)opal_list_get_next(&drng->super); - } - if 
(NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, nn))) { - node = OBJ_NEW(orte_node_t); - node->name = nodes[n]; - node->index = nn; - opal_pointer_array_set_item(orte_node_pool, nn, node); + for (n=0; n < orte_node_pool->size; n++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { + continue; } /* set the number of slots */ node->slots = srng->slots; - if (srng->endpt == nn) { + srng->cnt--; + if (0 == srng->cnt) { srng = (orte_regex_range_t*)opal_list_get_next(&srng->super); } /* set the flags */ @@ -940,52 +992,21 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) } else { ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN); } - if (frng->endpt == nn) { + frng->cnt--; + if (0 == frng->cnt) { frng = (orte_regex_range_t*)opal_list_get_next(&frng->super); } - ++orte_process_info.num_nodes; - /* if this is me, just ignore the rest as we are all setup */ - if (nn == (int)ORTE_PROC_MY_NAME->vpid) { - continue; - } - if (NULL != node->daemon) { - OBJ_RELEASE(node->daemon); - node->daemon = NULL; - } - if (NULL == (dptr = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, nn))) { - /* create a daemon object for this node */ - dptr = OBJ_NEW(orte_proc_t); - dptr->name.jobid = ORTE_PROC_MY_NAME->jobid; - dptr->name.vpid = nn; - ORTE_FLAG_SET(dptr, ORTE_PROC_FLAG_ALIVE); // assume the daemon is alive until discovered otherwise - opal_pointer_array_set_item(daemons->procs, nn, dptr); - ++daemons->num_procs; - } else if (NULL != dptr->node) { - OBJ_RELEASE(dptr->node); - dptr->node = NULL; - } - /* link the node to the daemon */ - OBJ_RETAIN(dptr); - node->daemon = dptr; - /* link the node to the daemon */ - OBJ_RETAIN(node); - dptr->node = node; } - /* we cannot use opal_argv_free here as this would release - * all the node names themselves. 
Instead, we just free the - * array of string pointers, leaving the strings alone */ - free(nodes); /* if no topology info was passed, then everyone shares our topology */ if (NULL == bptr) { - orte_topology_t *t; /* our topology is first in the array */ - t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0); + t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0); for (n=0; n < orte_node_pool->size; n++) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { if (NULL == node->topology) { - OBJ_RETAIN(t); - node->topology = t; + OBJ_RETAIN(t2); + node->topology = t2; } } } @@ -995,7 +1016,9 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) /* decompress the topology regex */ tmp = opal_argv_split(topos, ','); /* there must be a topology definition for each range */ + offset = 0; for (nn=0; NULL != tmp[nn]; nn++) { + cnt = strtoul(tmp[nn], NULL, 10); /* unpack the signature */ n = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &sig, &n, OPAL_STRING))) { @@ -1004,6 +1027,13 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) OBJ_RELEASE(bptr); goto cleanup; } + if (NULL == sig) { + rc = ORTE_ERR_BAD_PARAM; + ORTE_ERROR_LOG(rc); + opal_argv_free(tmp); + OBJ_RELEASE(bptr); + goto cleanup; + } n = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &topo, &n, OPAL_HWLOC_TOPO))) { ORTE_ERROR_LOG(rc); @@ -1013,11 +1043,12 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) goto cleanup; } /* see if we already have this topology - could be an update */ + t2 = NULL; for (n=0; n < orte_node_topologies->size; n++) { - if (NULL == (t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, n))) { + if (NULL == (t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, n))) { continue; } - if (0 == strcmp(t->sig, sig)) { + if (0 == strcmp(t2->sig, sig)) { /* found a match */ free(sig); opal_hwloc_base_free_topology(topo); @@ -1025,63 
+1056,32 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) break; } } - if (NULL != sig) { + if (NULL != sig || NULL == t2) { /* new topology - record it */ - t = OBJ_NEW(orte_topology_t); - t->sig = sig; - t->topo = topo; + t2 = OBJ_NEW(orte_topology_t); + t2->sig = sig; + t2->topo = topo; + opal_pointer_array_add(orte_node_topologies, t2); } - /* point each of the nodes in the regex to this topology */ - start = strtoul(tmp[nn], &rmndr, 10); - if (NULL != rmndr) { - /* it must be a range - find the endpoint */ - ++rmndr; - endpt = strtoul(rmndr, NULL, 10); - } else { - endpt = start; - } - for (k=start; k <= endpt; k++) { - if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, k))) { - if (NULL == node->topology) { - OBJ_RETAIN(t); - node->topology = t; - } + /* point each of the nodes in this range to this topology */ + n=0; + while (n < cnt && (n+offset) < orte_node_pool->size) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n+offset))) { + continue; } + if (NULL == node->topology) { + OBJ_RETAIN(t2); + node->topology = t2; + } + ++n; } + offset += cnt; } OBJ_RELEASE(bptr); opal_argv_free(tmp); } - /* unpdate num procs */ - if (orte_process_info.num_procs != daemons->num_procs) { - orte_process_info.num_procs = daemons->num_procs; - /* need to update the routing plan */ - orte_routed.update_routing_plan(NULL); - } - - if (orte_process_info.max_procs < orte_process_info.num_procs) { - orte_process_info.max_procs = orte_process_info.num_procs; - } - - /* update num_daemons */ - orte_process_info.num_daemons = daemons->num_procs; - - if (0 < opal_output_get_verbosity(orte_debug_verbosity)) { - int i; - for (i=0; i < orte_node_pool->size; i++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { - continue; - } - opal_output(0, "%s node[%d].name %s daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i, - (NULL == node->name) ? 
"NULL" : node->name, - (NULL == node->daemon) ? "NONE" : ORTE_VPID_PRINT(node->daemon->name.vpid)); - } - } - cleanup: - OPAL_LIST_DESTRUCT(&dids); OPAL_LIST_DESTRUCT(&slts); OPAL_LIST_DESTRUCT(&flgs); return rc; diff --git a/orte/util/nidmap.h b/orte/util/nidmap.h index e91be60e001..521cc352c0e 100644 --- a/orte/util/nidmap.h +++ b/orte/util/nidmap.h @@ -37,10 +37,16 @@ BEGIN_C_DECLS +#define ORTE_MAX_REGEX_CMD_LENGTH 1024 + #define ORTE_MAX_NODE_PREFIX 50 #define ORTE_CONTIG_NODE_CMD 0x01 #define ORTE_NON_CONTIG_NODE_CMD 0x02 + +ORTE_DECLSPEC int orte_util_nidmap_create(char **regex); +ORTE_DECLSPEC int orte_util_nidmap_parse(char *regex); + /* create a regular expression describing the nodes in the * allocation */ ORTE_DECLSPEC int orte_util_encode_nodemap(opal_buffer_t *buffer); @@ -49,11 +55,7 @@ ORTE_DECLSPEC int orte_util_encode_nodemap(opal_buffer_t *buffer); * into the orte_node_pool array */ ORTE_DECLSPEC int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer); -ORTE_DECLSPEC int orte_util_build_daemon_nidmap(char **nodes); - -ORTE_DECLSPEC int orte_util_encode_topologies(opal_buffer_t *buffer); - -ORTE_DECLSPEC int orte_util_decode_topologies(opal_buffer_t *buffer); +ORTE_DECLSPEC int orte_util_build_daemon_nidmap(void); END_C_DECLS diff --git a/orte/util/regex.c b/orte/util/regex.c index a723c877dbd..f59ed0000f6 100644 --- a/orte/util/regex.c +++ b/orte/util/regex.c @@ -63,230 +63,6 @@ static int regex_parse_node_ranges(char *base, char *ranges, int num_digits, char *suffix, char ***names); static int regex_parse_node_range(char *base, char *range, int num_digits, char *suffix, char ***names); -int orte_regex_create(char *nodelist, char **regexp) -{ - char *node; - char prefix[ORTE_MAX_NODE_PREFIX]; - int i, j, len, startnum, nodenum, numdigits; - bool found, fullname; - char *suffix, *sfx; - orte_regex_node_t *ndreg; - orte_regex_range_t *range; - opal_list_t nodeids; - opal_list_item_t *item, *itm2; - char **regexargs = NULL, *tmp, *tmp2; - char 
*cptr; - - /* define the default */ - *regexp = NULL; - - cptr = strchr(nodelist, ','); - if (NULL == cptr) { - /* if there is only one node, don't bother */ - *regexp = strdup(nodelist); - return ORTE_SUCCESS; - } - - /* setup the list of results */ - OBJ_CONSTRUCT(&nodeids, opal_list_t); - - /* cycle thru the array of nodenames */ - node = nodelist; - while (NULL != (cptr = strchr(node, ',')) || 0 < strlen(node)) { - if (NULL != cptr) { - *cptr = '\0'; - } - /* determine this node's prefix by looking for first non-alpha char */ - fullname = false; - len = strlen(node); - startnum = -1; - memset(prefix, 0, ORTE_MAX_NODE_PREFIX); - numdigits = 0; - for (i=0, j=0; i < len; i++) { - if (!isalpha(node[i])) { - /* found a non-alpha char */ - if (!isdigit(node[i])) { - /* if it is anything but a digit, we just use - * the entire name - */ - fullname = true; - break; - } - /* count the size of the numeric field - but don't - * add the digits to the prefix - */ - numdigits++; - if (startnum < 0) { - /* okay, this defines end of the prefix */ - startnum = i; - } - continue; - } - if (startnum < 0) { - prefix[j++] = node[i]; - } - } - if (fullname || startnum < 0) { - /* can't compress this name - just add it to the list */ - ndreg = OBJ_NEW(orte_regex_node_t); - ndreg->prefix = strdup(node); - opal_list_append(&nodeids, &ndreg->super); - /* move to the next posn */ - if (NULL == cptr) { - break; - } - node = cptr + 1; - continue; - } - /* convert the digits and get any suffix */ - nodenum = strtol(&node[startnum], &sfx, 10); - if (NULL != sfx) { - suffix = strdup(sfx); - } else { - suffix = NULL; - } - /* is this nodeid already on our list? 
*/ - found = false; - for (item = opal_list_get_first(&nodeids); - !found && item != opal_list_get_end(&nodeids); - item = opal_list_get_next(item)) { - ndreg = (orte_regex_node_t*)item; - if (0 < strlen(prefix) && NULL == ndreg->prefix) { - continue; - } - if (0 == strlen(prefix) && NULL != ndreg->prefix) { - continue; - } - if (0 < strlen(prefix) && NULL != ndreg->prefix - && 0 != strcmp(prefix, ndreg->prefix)) { - continue; - } - if (NULL == suffix && NULL != ndreg->suffix) { - continue; - } - if (NULL != suffix && NULL == ndreg->suffix) { - continue; - } - if (NULL != suffix && NULL != ndreg->suffix && - 0 != strcmp(suffix, ndreg->suffix)) { - continue; - } - if (numdigits != ndreg->num_digits) { - continue; - } - /* found a match - flag it */ - found = true; - /* get the last range on this nodeid - we do this - * to preserve order - */ - range = (orte_regex_range_t*)opal_list_get_last(&ndreg->ranges); - if (NULL == range) { - /* first range for this nodeid */ - range = OBJ_NEW(orte_regex_range_t); - range->start = nodenum; - range->cnt = 1; - opal_list_append(&ndreg->ranges, &range->super); - break; - } - /* see if the node number is out of sequence */ - if (nodenum != (range->start + range->cnt)) { - /* start a new range */ - range = OBJ_NEW(orte_regex_range_t); - range->start = nodenum; - range->cnt = 1; - opal_list_append(&ndreg->ranges, &range->super); - break; - } - /* everything matches - just increment the cnt */ - range->cnt++; - break; - } - if (!found) { - /* need to add it */ - ndreg = OBJ_NEW(orte_regex_node_t); - if (0 < strlen(prefix)) { - ndreg->prefix = strdup(prefix); - } - if (NULL != suffix) { - ndreg->suffix = strdup(suffix); - } - ndreg->num_digits = numdigits; - opal_list_append(&nodeids, &ndreg->super); - /* record the first range for this nodeid - we took - * care of names we can't compress above - */ - range = OBJ_NEW(orte_regex_range_t); - range->start = nodenum; - range->cnt = 1; - opal_list_append(&ndreg->ranges, &range->super); - } 
- if (NULL != suffix) { - free(suffix); - } - /* move to the next posn */ - if (NULL == cptr) { - break; - } - node = cptr + 1; - } - - /* begin constructing the regular expression */ - while (NULL != (item = opal_list_remove_first(&nodeids))) { - ndreg = (orte_regex_node_t*)item; - - /* if no ranges, then just add the name */ - if (0 == opal_list_get_size(&ndreg->ranges)) { - if (NULL != ndreg->prefix) { - /* solitary node */ - asprintf(&tmp, "%s", ndreg->prefix); - opal_argv_append_nosize(®exargs, tmp); - free(tmp); - } - OBJ_RELEASE(ndreg); - continue; - } - /* start the regex for this nodeid with the prefix */ - if (NULL != ndreg->prefix) { - asprintf(&tmp, "%s[%d:", ndreg->prefix, ndreg->num_digits); - } else { - asprintf(&tmp, "[%d:", ndreg->num_digits); - } - /* add the ranges */ - while (NULL != (itm2 = opal_list_remove_first(&ndreg->ranges))) { - range = (orte_regex_range_t*)itm2; - if (1 == range->cnt) { - asprintf(&tmp2, "%s%d,", tmp, range->start); - } else { - asprintf(&tmp2, "%s%d-%d,", tmp, range->start, range->start + range->cnt - 1); - } - free(tmp); - tmp = tmp2; - OBJ_RELEASE(range); - } - /* replace the final comma */ - tmp[strlen(tmp)-1] = ']'; - if (NULL != ndreg->suffix) { - /* add in the suffix, if provided */ - asprintf(&tmp2, "%s%s", tmp, ndreg->suffix); - free(tmp); - tmp = tmp2; - } - opal_argv_append_nosize(®exargs, tmp); - free(tmp); - OBJ_RELEASE(ndreg); - } - - /* assemble final result */ - *regexp = opal_argv_join(regexargs, ','); - /* cleanup */ - opal_argv_free(regexargs); - - OBJ_DESTRUCT(&nodeids); - - - return ORTE_SUCCESS; -} - int orte_regex_extract_node_names(char *regexp, char ***names) { int i, j, k, len, ret; @@ -592,7 +368,7 @@ static int regex_parse_node_range(char *base, char *range, int num_digits, char static void range_construct(orte_regex_range_t *ptr) { - ptr->start = 0; + ptr->vpid = 0; ptr->cnt = 0; } OBJ_CLASS_INSTANCE(orte_regex_range_t, diff --git a/orte/util/regex.h b/orte/util/regex.h index 
1e8ab8bc859..b58cacb8072 100644 --- a/orte/util/regex.h +++ b/orte/util/regex.h @@ -36,8 +36,7 @@ BEGIN_C_DECLS typedef struct { opal_list_item_t super; - int start; - int endpt; + int vpid; int cnt; int slots; orte_topology_t *t; @@ -54,11 +53,6 @@ typedef struct { } orte_regex_node_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_regex_node_t); -/* NOTE: this is a destructive call for the nodes param - the - * function will search and replace all commas with '\0' - */ -ORTE_DECLSPEC int orte_regex_create(char *nodes, char **regexp); - ORTE_DECLSPEC int orte_regex_extract_node_names(char *regexp, char ***names); END_C_DECLS From dea38f82cb32e78cacbd29da8edb7a0292c353d9 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sat, 6 May 2017 19:08:50 -0700 Subject: [PATCH 02/29] Enable full operations under SLURM on Cray systems by co-locating a daemon with mpirun when mpirun is executing on a compute node in that environment. This allows local application procs to inherit their security credential from the daemon as it will have been launched via SLURM Signed-off-by: Ralph Castain (cherry picked from commit a143800bce14a2aec2a81220014e807e646945a8) --- config/orte_check_slurm.m4 | 10 ++++++++ orte/mca/plm/slurm/configure.m4 | 9 +------ orte/mca/plm/slurm/help-plm-slurm.txt | 15 ----------- orte/mca/plm/slurm/plm_slurm_module.c | 19 -------------- orte/mca/ras/base/ras_base_node.c | 36 ++++++++++++++++++++++++--- 5 files changed, 43 insertions(+), 46 deletions(-) diff --git a/config/orte_check_slurm.m4 b/config/orte_check_slurm.m4 index b59e5f5804b..ee5cd02cce7 100644 --- a/config/orte_check_slurm.m4 +++ b/config/orte_check_slurm.m4 @@ -13,6 +13,7 @@ # Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2016 Los Alamos National Security, LLC. All rights # reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -68,6 +69,15 @@ AC_DEFUN([ORTE_CHECK_SLURM],[ [orte_check_slurm_happy="yes"], [orte_check_slurm_happy="no"])]) + # check to see if this is a Cray nativized slurm env. + + slurm_cray_env=0 + OPAL_CHECK_ALPS([orte_slurm_cray], + [slurm_cray_env=1]) + + AC_DEFINE_UNQUOTED([SLURM_CRAY_ENV],[$slurm_cray_env], + [defined to 1 if slurm cray env, 0 otherwise]) + OPAL_SUMMARY_ADD([[Resource Managers]],[[Slurm]],[$1],[$orte_check_slurm_happy]) fi diff --git a/orte/mca/plm/slurm/configure.m4 b/orte/mca/plm/slurm/configure.m4 index 6aabe477107..fa7267e531d 100644 --- a/orte/mca/plm/slurm/configure.m4 +++ b/orte/mca/plm/slurm/configure.m4 @@ -13,6 +13,7 @@ # Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011-2016 Los Alamos National Security, LLC. # All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -38,12 +39,4 @@ AC_DEFUN([MCA_orte_plm_slurm_CONFIG],[ AC_SUBST([plm_slurm_LDFLAGS]) AC_SUBST([plm_slurm_LIBS]) - # check to see if this is a Cray nativized slurm env. - - slurm_cray_env=0 - OPAL_CHECK_ALPS([plm_slurm_cray], - [slurm_cray_env=1]) - - AC_DEFINE_UNQUOTED([SLURM_CRAY_ENV],[$slurm_cray_env], - [defined to 1 if slurm cray env, 0 otherwise]) ])dnl diff --git a/orte/mca/plm/slurm/help-plm-slurm.txt b/orte/mca/plm/slurm/help-plm-slurm.txt index 837c3e88a89..9cc5af5b444 100644 --- a/orte/mca/plm/slurm/help-plm-slurm.txt +++ b/orte/mca/plm/slurm/help-plm-slurm.txt @@ -49,18 +49,3 @@ are running. Please consult with your system administrator about obtaining such support. -[no-local-support] -The SLURM process starter cannot start processes local to -mpirun when executing under a Cray environment. The problem -is that mpirun is not itself a child of a slurmd daemon. Thus, -any processes mpirun itself starts will inherit incorrect -RDMA credentials. 
- -Your application will be mapped and run (assuming adequate -resources) on the remaining allocated nodes. If adequate -resources are not available, you will need to exit and obtain -a larger allocation. - -This situation will be fixed in a future release. Meantime, -you can turn "off" this warning by setting the plm_slurm_warning -MCA param to 0. diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index fc62b057f3b..4c5e7e11672 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -193,25 +193,6 @@ static void launch_daemons(int fd, short args, void *cbdata) "%s plm:slurm: LAUNCH DAEMONS CALLED", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); -#if SLURM_CRAY_ENV - /* if we are in a Cray-SLURM environment, then we cannot - * launch procs local to the HNP. The problem - * is the MPI processes launched on the head node (where the - * ORTE_PROC_IS_HNP evalues to true) get launched by a daemon - * (mpirun) which is not a child of a slurmd daemon. This - * means that any RDMA credentials obtained via the odls/alps - * local launcher are incorrect. So warn the user and set - * the envar for no_schedule_local if mpirun is not on a - * system management node (i.e. 
is part of the allocation) - * and the "no_use_local" flag hasn't been set */ - if (mca_plm_slurm_component.slurm_warning_msg && - (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) { - orte_show_help("help-plm-slurm.txt", "no-local-support", true); - ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL); - mca_plm_slurm_component.slurm_warning_msg = false; // only do this once - } -#endif - /* if we are launching debugger daemons, then just go * do it - no new daemons will be launched */ diff --git a/orte/mca/ras/base/ras_base_node.c b/orte/mca/ras/base/ras_base_node.c index ae11c44db59..5fd3b3dda26 100644 --- a/orte/mca/ras/base/ras_base_node.c +++ b/orte/mca/ras/base/ras_base_node.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -30,6 +30,7 @@ #include "opal/util/if.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rmaps/base/base.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" @@ -46,7 +47,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) int rc, i; orte_node_t *node, *hnp_node, *nptr; char *ptr; - bool hnp_alone = true; + bool hnp_alone = true, skiphnp = false; orte_attribute_t *kv; char **alias=NULL, **nalias; @@ -77,6 +78,33 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) /* get the hnp node's info */ hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); +#if SLURM_CRAY_ENV + /* if we are in a Cray-SLURM environment, then we cannot + * launch procs local to the HNP. 
The problem + * is the MPI processes launched on the head node (where the + * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon + * (mpirun) which is not a child of a slurmd daemon. This + * means that any RDMA credentials obtained via the odls/alps + * local launcher are incorrect. Test for this condition. If + * found, then take steps to ensure we launch a daemon on + * the same node as mpirun and that it gets used to fork + * local procs instead of mpirun so they get the proper + * credential */ + if (NULL != hnp_node) { + OPAL_LIST_FOREACH(node, nodes, orte_node_t) { + if (orte_ifislocal(node->name)) { + orte_hnp_is_allocated = true; + break; + } + } + if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) { + hnp_node->name = strdup("mpirun"); + skiphnp = true; + ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL); + } + } +#endif + /* cycle through the list */ while (NULL != (item = opal_list_remove_first(nodes))) { @@ -86,7 +114,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) * first position since it is the first one entered. 
We need to check to see * if this node is the same as the HNP's node so we don't double-enter it */ - if (NULL != hnp_node && orte_ifislocal(node->name)) { + if (!skiphnp && NULL != hnp_node && orte_ifislocal(node->name)) { OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:node_insert updating HNP [%s] info to %ld slots", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -189,7 +217,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata) * ensure we don't have any domain info in the node record * for the hnp */ - if (!orte_have_fqdn_allocation && !hnp_alone) { + if (NULL != hnp_node && !orte_have_fqdn_allocation && !hnp_alone) { if (NULL != (ptr = strchr(hnp_node->name, '.'))) { *ptr = '\0'; } From d2a5a9874350025c7dadb69cb62ad386608c9271 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Mon, 8 May 2017 16:41:01 +0900 Subject: [PATCH 03/29] orte/util: fix vpids parsing in orte_util_nidmap_parse() Signed-off-by: Gilles Gouaillardet (cherry picked from commit e101f2b3f995426b147a826fc82a84e07e913adf) --- orte/util/nidmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index 02ef5b8e7d8..f62b9d91f87 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -13,7 +13,7 @@ * Copyright (c) 2012-2014 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science + * Copyright (c) 2014-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * @@ -756,8 +756,8 @@ int orte_util_nidmap_parse(char *regex) opal_list_append(&dids, &rng->super); /* check for a count */ if (NULL != (ptr = strchr(dvpids[n], '('))) { + dvpids[n][strlen(dvpids[n])-1] = '\0'; // remove trailing paren *ptr = '\0'; - dvpids[n][strlen(dvpids[n])-2] = '\0'; // remove trailing paren ++ptr; rng->cnt = strtoul(ptr, NULL, 10); } From 2387024e20796690f84b150b867d83b9e6155697 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 25 Apr 2017 21:24:21 -0700 Subject: [PATCH 04/29] Do not pass topologies during tree spawn of daemons as there is no way the HNP can know the backend topologies at that point. Any needed topologies will be sent along with the launch_apps command Do not pass param file MCA params if the user has requested that no param files be read - required when trying to avoid launch time penalties from large numbers of processes reading default param files. The daemon picks them up and passes them along anyway, so it isn't clear what value we gain from having them all read the defaults Signed-off-by: Ralph Castain (cherry picked from commit 180809f2eface23f34e4432e28e8e0a07202734c) --- orte/mca/plm/base/plm_base_launch_support.c | 134 +++++++++++-------- orte/mca/plm/rsh/plm_rsh_module.c | 15 --- orte/mca/rmaps/base/rmaps_base_support_fns.c | 2 +- orte/util/nidmap.c | 11 +- 4 files changed, 84 insertions(+), 78 deletions(-) diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 49890762f2f..fb233fafbf0 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1544,51 +1544,34 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, opal_argv_append(argc, argv, orte_xterm); } - /* - * Pass along the Aggregate MCA Parameter Sets - */ - /* Add the 'prefix' param */ - tmp_value = NULL; - - loc_id = mca_base_var_find("opal", "mca", "base", "envar_file_prefix"); + loc_id = mca_base_var_find("opal", 
"mca", "base", "param_files"); if (loc_id < 0) { rc = OPAL_ERR_NOT_FOUND; ORTE_ERROR_LOG(rc); return rc; } + tmp_value = NULL; rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; } - if( NULL != tmp_value && NULL != tmp_value[0] ) { - /* Could also use the short version '-tune' - * but being verbose has some value - */ - opal_argv_append(argc, argv, "-mca"); - opal_argv_append(argc, argv, "mca_base_envar_file_prefix"); - opal_argv_append(argc, argv, tmp_value[0]); + if (NULL != tmp_value && NULL != tmp_value[0]) { + rc = strcmp(tmp_value[0], "none"); + } else { + rc = 1; } - tmp_value2 = NULL; - loc_id = mca_base_var_find("opal", "mca", "base", "param_file_prefix"); - mca_base_var_get_value(loc_id, &tmp_value2, NULL, NULL); - if( NULL != tmp_value2 && NULL != tmp_value2[0] ) { - /* Could also use the short version '-am' - * but being verbose has some value + if (0 != rc) { + /* + * Pass along the Aggregate MCA Parameter Sets */ - opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); - opal_argv_append(argc, argv, "mca_base_param_file_prefix"); - opal_argv_append(argc, argv, tmp_value2[0]); - orte_show_help("help-plm-base.txt", "deprecated-amca", true); - } - - if ((NULL != tmp_value && NULL != tmp_value[0]) - || (NULL != tmp_value2 && NULL != tmp_value2[0])) { - /* Add the 'path' param */ + /* Add the 'prefix' param */ tmp_value = NULL; - loc_id = mca_base_var_find("opal", "mca", "base", "param_file_path"); + + loc_id = mca_base_var_find("opal", "mca", "base", "envar_file_prefix"); if (loc_id < 0) { + rc = OPAL_ERR_NOT_FOUND; ORTE_ERROR_LOG(rc); return rc; } @@ -1598,39 +1581,76 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, return rc; } if( NULL != tmp_value && NULL != tmp_value[0] ) { - opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); - opal_argv_append(argc, argv, "mca_base_param_file_path"); + /* Could also use the short version '-tune' + * but being verbose has some 
value + */ + opal_argv_append(argc, argv, "-mca"); + opal_argv_append(argc, argv, "mca_base_envar_file_prefix"); opal_argv_append(argc, argv, tmp_value[0]); } - /* Add the 'path' param */ - opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); - opal_argv_append(argc, argv, "mca_base_param_file_path_force"); - - tmp_value = NULL; - loc_id = mca_base_var_find("opal", "mca", "base", "param_file_path_force"); - if (loc_id < 0) { - rc = OPAL_ERR_NOT_FOUND; - ORTE_ERROR_LOG(rc); - return rc; - } - rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL); - if (OPAL_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - return rc; + tmp_value2 = NULL; + loc_id = mca_base_var_find("opal", "mca", "base", "param_file_prefix"); + mca_base_var_get_value(loc_id, &tmp_value2, NULL, NULL); + if( NULL != tmp_value2 && NULL != tmp_value2[0] ) { + /* Could also use the short version '-am' + * but being verbose has some value + */ + opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); + opal_argv_append(argc, argv, "mca_base_param_file_prefix"); + opal_argv_append(argc, argv, tmp_value2[0]); + orte_show_help("help-plm-base.txt", "deprecated-amca", true); } - if( NULL == tmp_value || NULL == tmp_value[0] ) { - /* Get the current working directory */ - tmp_force = (char *) malloc(sizeof(char) * OPAL_PATH_MAX); - if (NULL == getcwd(tmp_force, OPAL_PATH_MAX)) { - free(tmp_force); - tmp_force = strdup(""); + + if ((NULL != tmp_value && NULL != tmp_value[0]) + || (NULL != tmp_value2 && NULL != tmp_value2[0])) { + /* Add the 'path' param */ + tmp_value = NULL; + loc_id = mca_base_var_find("opal", "mca", "base", "param_file_path"); + if (loc_id < 0) { + ORTE_ERROR_LOG(rc); + return rc; + } + rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL); + if (ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + if( NULL != tmp_value && NULL != tmp_value[0] ) { + opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); + opal_argv_append(argc, argv, "mca_base_param_file_path"); + 
opal_argv_append(argc, argv, tmp_value[0]); } - opal_argv_append(argc, argv, tmp_force); - free(tmp_force); - } else { - opal_argv_append(argc, argv, tmp_value[0]); + /* Add the 'path' param */ + opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); + opal_argv_append(argc, argv, "mca_base_param_file_path_force"); + + tmp_value = NULL; + loc_id = mca_base_var_find("opal", "mca", "base", "param_file_path_force"); + if (loc_id < 0) { + rc = OPAL_ERR_NOT_FOUND; + ORTE_ERROR_LOG(rc); + return rc; + } + rc = mca_base_var_get_value(loc_id, &tmp_value, NULL, NULL); + if (OPAL_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + if( NULL == tmp_value || NULL == tmp_value[0] ) { + /* Get the current working directory */ + tmp_force = (char *) malloc(sizeof(char) * OPAL_PATH_MAX); + if (NULL == getcwd(tmp_force, OPAL_PATH_MAX)) { + free(tmp_force); + tmp_force = strdup(""); + } + + opal_argv_append(argc, argv, tmp_force); + free(tmp_force); + } else { + opal_argv_append(argc, argv, tmp_value[0]); + } } } diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index ac1f501c390..9164f5870fa 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -800,15 +800,6 @@ static int remote_spawn(opal_buffer_t *launch) goto cleanup; } - /* extract and update the daemon map */ - if (ORTE_SUCCESS != (rc = orte_util_decode_daemon_nodemap(launch))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* since we are tree-spawning, we need to update the routing plan */ - orte_routed.update_routing_plan(NULL); - /* get the updated routing list */ rtmod = orte_rml.get_routed(orte_coll_conduit); OBJ_CONSTRUCT(&coll, opal_list_t); @@ -1177,12 +1168,6 @@ static void launch_daemons(int fd, short args, void *cbdata) OBJ_RELEASE(orte_tree_launch_cmd); goto cleanup; } - /* construct a nodemap of all daemons we know about */ - if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(orte_tree_launch_cmd))) { - ORTE_ERROR_LOG(rc); - 
OBJ_RELEASE(orte_tree_launch_cmd); - goto cleanup; - } /* get the orted job data object */ if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index 4bc44bf3b0e..6fd1d7cec0e 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -413,7 +413,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr * are getting for an initial map of a job, * then mark all nodes as unmapped */ - ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } if (NULL == nd || NULL == nd->daemon || NULL == node->daemon || diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index f62b9d91f87..11bd366d344 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -447,7 +447,6 @@ int orte_util_nidmap_create(char **regex) asprintf(&tmp2, "%s@%s", nodenames, tmp); free(nodenames); free(tmp); - *regex = tmp2; return ORTE_SUCCESS; } @@ -760,9 +759,10 @@ int orte_util_nidmap_parse(char *regex) *ptr = '\0'; ++ptr; rng->cnt = strtoul(ptr, NULL, 10); + } else { + rng->cnt = 1; } - /* convert the number - since it might be a range, - * save the remainder pointer */ + /* convert the number */ rng->vpid = strtoul(dvpids[n], NULL, 10); } opal_argv_free(dvpids); @@ -797,16 +797,17 @@ int orte_util_nidmap_parse(char *regex) nd->daemon = proc; } ++cnt; - if (cnt == rng->cnt) { + if (rng->cnt <= cnt) { rng = (orte_regex_range_t*)opal_list_get_next(&rng->super); if (NULL == rng) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } + cnt = 0; } } - /* unpdate num procs */ + /* update num procs */ if (orte_process_info.num_procs != daemons->num_procs) { orte_process_info.num_procs = daemons->num_procs; /* need to update the routing plan */ From 47c4f88180b656697fcd62455d54afc87bc46ad5 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: 
Wed, 10 May 2017 11:49:27 +0900 Subject: [PATCH 05/29] pmix2x: plug a misc memory leak Signed-off-by: Gilles Gouaillardet (cherry picked from commit 026f3dd2dd88b8abb45a842bd42145f683689d28) --- opal/mca/pmix/pmix2x/pmix2x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opal/mca/pmix/pmix2x/pmix2x.c b/opal/mca/pmix/pmix2x/pmix2x.c index efa8047d266..9cd36f1001e 100644 --- a/opal/mca/pmix/pmix2x/pmix2x.c +++ b/opal/mca/pmix/pmix2x/pmix2x.c @@ -1402,8 +1402,8 @@ static void opdes(pmix2x_opcaddy_t *p) if (NULL != p->error_procs) { PMIX_PROC_FREE(p->error_procs, p->nerror_procs); } - if (NULL != p->info) { - PMIX_INFO_FREE(p->info, p->sz); + if (0 < p->ninfo) { + PMIX_INFO_FREE(p->info, p->ninfo); } if (NULL != p->apps) { PMIX_APP_FREE(p->apps, p->sz); From 8f824865d8851fad3530592ae51146ca1ee9426b Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 9 May 2017 16:06:15 -0700 Subject: [PATCH 06/29] Fix the nidmap computation to deal with hetero nodes Signed-off-by: Ralph Castain (cherry picked from commit 442e307a6eb42803fcb5c14d5af64e15f5591d4b) --- orte/mca/rmaps/round_robin/rmaps_rr.c | 8 +- orte/mca/rmaps/round_robin/rmaps_rr_mappers.c | 9 ++ orte/util/nidmap.c | 139 +++++++++--------- 3 files changed, 89 insertions(+), 67 deletions(-) diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index 863e959e338..06b621383c7 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -12,7 +12,7 @@ * Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -108,6 +108,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np", true, jdata->num_apps, NULL); rc = ORTE_ERR_SILENT; + opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } @@ -118,6 +119,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, jdata->map->mapping, initial_map, false))) { ORTE_ERROR_LOG(rc); + opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } /* flag that all subsequent requests should not reset the node->mapped flag */ @@ -236,10 +238,12 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) true, "mapping", orte_rmaps_base_print_mapping(jdata->map->mapping)); rc = ORTE_ERR_SILENT; + opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); + opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } @@ -249,6 +253,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { ORTE_ERROR_LOG(rc); + opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); return rc; } @@ -270,6 +275,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) return ORTE_SUCCESS; error: + opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); while(NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } diff --git a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c index c0b08e2a033..8c2c9925e49 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c @@ -493,6 +493,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, 
orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } } @@ -510,6 +511,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, if (NULL == node->topology || NULL == node->topology->topo) { orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } start = 0; @@ -548,6 +550,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, /* add this node to the map, if reqd */ if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); ORTE_ERROR_LOG(idx); return idx; } @@ -566,15 +569,18 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, /* get the hwloc object */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, (i+start) % nobjs, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_NOT_FOUND; } if (orte_rmaps_base.cpus_per_rank > (int)opal_hwloc_base_get_npus(node->topology->topo, obj)) { orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low", true, orte_rmaps_base.cpus_per_rank, opal_hwloc_base_get_npus(node->topology->topo, obj), orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; @@ -601,12 +607,14 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, 
orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { /* if we were explicitly told not to oversubscribe, then don't */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } } @@ -621,6 +629,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, if (nprocs_mapped < app->num_procs) { /* usually means there were no objects of the requested type */ + opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_NOT_FOUND; } diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index 11bd366d344..ef7509e2a88 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -494,34 +494,50 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) return rc; } - for (n=0; n < orte_node_pool->size; n++) { + /* there is always one topology - our own - so start with it */ + nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); + tp = OBJ_NEW(orte_regex_range_t); + tp->t = nptr->topology; + tp->cnt = 1; + opal_list_append(&topos, &tp->super); + + /* likewise, we have slots */ + slt = OBJ_NEW(orte_regex_range_t); + slt->slots = nptr->slots; + slt->cnt = 1; + opal_list_append(&slots, &slt->super); + + /* and flags */ + flg = OBJ_NEW(orte_regex_range_t); + if (ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN)) { + flg->slots = 1; + } else { + flg->slots = 0; + } + flg->cnt = 1; + opal_list_append(&flags, &flg->super); + + for (n=1; n < orte_node_pool->size; n++) { if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { continue; } /* check the #slots */ - if (NULL == slt) { - /* 
just starting */ + /* is this the next in line */ + if (nptr->slots == slt->slots) { + slt->cnt++; + } else { + /* need to start another range */ slt = OBJ_NEW(orte_regex_range_t); slt->slots = nptr->slots; slt->cnt = 1; opal_list_append(&slots, &slt->super); - } else { - /* is this the next in line */ - if (nptr->slots == slt->slots) { - slt->cnt++; - } else { - /* need to start another range */ - slt = OBJ_NEW(orte_regex_range_t); - slt->slots = nptr->slots; - slt->cnt = 1; - opal_list_append(&slots, &slt->super); - } } /* check the topologies */ - if (NULL == tp) { - /* just starting */ + if (NULL == nptr->topology) { + /* we don't know this topology, likely because + * we don't have a daemon on the node */ tp = OBJ_NEW(orte_regex_range_t); - tp->t = nptr->topology; + tp->t = NULL; tp->cnt = 1; opal_list_append(&topos, &tp->super); } else { @@ -538,8 +554,12 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) } /* check the flags */ test = ORTE_FLAG_TEST(nptr, ORTE_NODE_FLAG_SLOTS_GIVEN); - if (NULL == flg) { - /* just starting */ + /* is this the next in line */ + if ((test && 1 == flg->slots) || + (!test && 0 == flg->slots)) { + flg->cnt++; + } else { + /* need to start another range */ flg = OBJ_NEW(orte_regex_range_t); if (test) { flg->slots = 1; @@ -548,22 +568,6 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) } flg->cnt = 1; opal_list_append(&flags, &flg->super); - } else { - /* is this the next in line */ - if ((test && 1 == flg->slots) || - (!test && 0 == flg->slots)) { - flg->cnt++; - } else { - /* need to start another range */ - flg = OBJ_NEW(orte_regex_range_t); - if (test) { - flg->slots = 1; - } else { - flg->slots = 0; - } - flg->cnt = 1; - opal_list_append(&flags, &flg->super); - } } } @@ -581,7 +585,6 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) OBJ_RELEASE(rng); } OPAL_LIST_DESTRUCT(&slots); - /* pack the string */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); @@ -640,13 +643,6 
@@ int orte_util_encode_nodemap(opal_buffer_t *buffer) OBJ_CONSTRUCT(&bucket, opal_buffer_t); while (NULL != (item = opal_list_remove_first(&topos))) { rng = (orte_regex_range_t*)item; - if (NULL == rng->t) { - /* when we pass thru here prior to launching the daemons, we - * won't have topologies for them and so this entry might - * be NULL - protect ourselves */ - OBJ_RELEASE(item); - continue; - } if (NULL == tmp) { asprintf(&tmp, "%d", rng->cnt); } else { @@ -654,28 +650,40 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) free(tmp); tmp = tmp2; } - /* pack this topology string */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->sig, 1, OPAL_STRING))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(rng); - OPAL_LIST_DESTRUCT(&topos); - OBJ_DESTRUCT(&bucket); - free(tmp); - return rc; - } - /* pack the topology itself */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->topo, 1, OPAL_HWLOC_TOPO))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(rng); - OPAL_LIST_DESTRUCT(&topos); - OBJ_DESTRUCT(&bucket); - free(tmp); - return rc; + if (NULL == rng->t) { + /* need to account for NULL topology */ + tmp2 = NULL; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &tmp2, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(rng); + OPAL_LIST_DESTRUCT(&topos); + OBJ_DESTRUCT(&bucket); + free(tmp); + return rc; + } + } else { + /* pack this topology string */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->sig, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(rng); + OPAL_LIST_DESTRUCT(&topos); + OBJ_DESTRUCT(&bucket); + free(tmp); + return rc; + } + /* pack the topology itself */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->topo, 1, OPAL_HWLOC_TOPO))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(rng); + OPAL_LIST_DESTRUCT(&topos); + OBJ_DESTRUCT(&bucket); + free(tmp); + return rc; + } } OBJ_RELEASE(rng); } OPAL_LIST_DESTRUCT(&topos); - /* pack the string */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { 
ORTE_ERROR_LOG(rc); @@ -1029,11 +1037,10 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) goto cleanup; } if (NULL == sig) { - rc = ORTE_ERR_BAD_PARAM; - ORTE_ERROR_LOG(rc); - opal_argv_free(tmp); - OBJ_RELEASE(bptr); - goto cleanup; + /* the nodes in this range have not reported a topology, + * so skip them */ + offset += cnt; + continue; } n = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &topo, &n, OPAL_HWLOC_TOPO))) { From 6bfacc32d797fd5f3d4534c6f85d66a9ace6c0be Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 10 May 2017 11:26:42 -0700 Subject: [PATCH 07/29] Sigh - remove debug Signed-off-by: Ralph Castain (cherry picked from commit 911961ee21f4cef8cfef1befab1e9b962dd3d5ae) --- orte/mca/rmaps/round_robin/rmaps_rr.c | 7 ------- orte/mca/rmaps/round_robin/rmaps_rr_mappers.c | 9 --------- 2 files changed, 16 deletions(-) diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index 06b621383c7..a764e0243f3 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -108,7 +108,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np", true, jdata->num_apps, NULL); rc = ORTE_ERR_SILENT; - opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } @@ -119,7 +118,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, jdata->map->mapping, initial_map, false))) { ORTE_ERROR_LOG(rc); - opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } /* flag that all subsequent requests should not reset the node->mapped flag */ @@ -238,12 +236,10 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) true, "mapping", orte_rmaps_base_print_mapping(jdata->map->mapping)); rc = ORTE_ERR_SILENT; - opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } if (ORTE_SUCCESS != 
rc) { ORTE_ERROR_LOG(rc); - opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); goto error; } @@ -253,7 +249,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { ORTE_ERROR_LOG(rc); - opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); return rc; } @@ -275,7 +270,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) return ORTE_SUCCESS; error: - opal_output(0, "RMAPS RR FAILING: %s:%d", __FILE__, __LINE__); while(NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } @@ -287,4 +281,3 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) orte_rmaps_base_module_t orte_rmaps_round_robin_module = { orte_rmaps_rr_map }; - diff --git a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c index 8c2c9925e49..c0b08e2a033 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c @@ -493,7 +493,6 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } } @@ -511,7 +510,6 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, if (NULL == node->topology || NULL == node->topology->topo) { orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } start = 0; @@ -550,7 +548,6 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, /* add this node to the map, if reqd */ if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); 
ORTE_ERROR_LOG(idx); return idx; } @@ -569,18 +566,15 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, /* get the hwloc object */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, (i+start) % nobjs, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_NOT_FOUND; } if (orte_rmaps_base.cpus_per_rank > (int)opal_hwloc_base_get_npus(node->topology->topo, obj)) { orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low", true, orte_rmaps_base.cpus_per_rank, opal_hwloc_base_get_npus(node->topology->topo, obj), orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; @@ -607,14 +601,12 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { /* if we were explicitly told not to oversubscribe, then don't */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - opal_output(0, "RMAPS RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_SILENT; } } @@ -629,7 +621,6 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, if (nprocs_mapped < app->num_procs) { /* usually means there were no objects of the requested type */ - opal_output(0, "RMAPS 
RR NO-SPAN FAILING: %s:%d", __FILE__, __LINE__); return ORTE_ERR_NOT_FOUND; } From a6a9a67926085c9e3f9c7dc3f6cf22725e97111c Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 10 May 2017 12:40:02 -0700 Subject: [PATCH 08/29] Add verbose output to nidmap code for debugging as this is a new, and sometimes fragile, feature Signed-off-by: Ralph Castain (cherry picked from commit 55f4b825af506dad3caf7eeacf9b7ab1782e3fc3) --- orte/runtime/orte_init.c | 2 ++ orte/util/nidmap.c | 43 ++++++++++++++++++++++++++++++++++++++++ orte/util/nidmap.h | 2 ++ 3 files changed, 47 insertions(+) diff --git a/orte/runtime/orte_init.c b/orte/runtime/orte_init.c index 03eaab0c0f6..a4e4bee5969 100644 --- a/orte/runtime/orte_init.c +++ b/orte/runtime/orte_init.c @@ -47,6 +47,7 @@ #include "orte/mca/schizo/base/base.h" #include "orte/util/listener.h" #include "orte/util/name_fns.h" +#include "orte/util/nidmap.h" #include "orte/util/proc_info.h" #include "orte/util/error_strings.h" #include "orte/orted/pmix/pmix_server.h" @@ -201,6 +202,7 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags) if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { /* let the pmix server register params */ pmix_server_register_params(); + orte_util_nidmap_init(); } /* open the SCHIZO framework as everyone needs it, and the diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index ef7509e2a88..1243e1dd8ff 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -74,6 +74,27 @@ #include "orte/util/nidmap.h" +static int orte_nidmap_verbosity = -1; +static int orte_nidmap_output = -1; + +void orte_util_nidmap_init(void) +{ + orte_nidmap_verbosity = -1; + (void) mca_base_var_register ("orte", "orte", NULL, "nidmap_verbose", + "Verbosity level for ORTE debug messages in the nidmap utilities", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, + &orte_nidmap_verbosity); + + /* set default output */ + orte_nidmap_output = opal_output_open(NULL); + + /* open up the verbose 
output for debugging */ + if (0 < orte_nidmap_verbosity) { + opal_output_set_verbosity(orte_nidmap_output, orte_nidmap_verbosity); + } +} + int orte_util_build_daemon_nidmap(void) { int i; @@ -585,6 +606,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) OBJ_RELEASE(rng); } OPAL_LIST_DESTRUCT(&slots); + opal_output_verbose(1, orte_nidmap_output, + "%s SLOT ASSIGNMENTS: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); /* pack the string */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); @@ -610,6 +634,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) OPAL_LIST_DESTRUCT(&flags); /* pack the string */ + opal_output_verbose(1, orte_nidmap_output, + "%s FLAG ASSIGNMENTS: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); return rc; @@ -652,6 +679,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) } if (NULL == rng->t) { /* need to account for NULL topology */ + opal_output_verbose(1, orte_nidmap_output, + "%s PACKING NULL TOPOLOGY", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); tmp2 = NULL; if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &tmp2, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); @@ -662,6 +692,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) return rc; } } else { + opal_output_verbose(1, orte_nidmap_output, + "%s PACKING TOPOLOGY: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rng->t->sig); /* pack this topology string */ if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &rng->t->sig, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); @@ -685,6 +718,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) } OPAL_LIST_DESTRUCT(&topos); /* pack the string */ + opal_output_verbose(1, orte_nidmap_output, + "%s TOPOLOGY ASSIGNMENTS: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&bucket); @@ -1011,6 +1047,9 @@ int 
orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) if (NULL == bptr) { /* our topology is first in the array */ t2 = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0); + opal_output_verbose(1, orte_nidmap_output, + "%s ASSIGNING ALL TOPOLOGIES TO: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), t2->sig); for (n=0; n < orte_node_pool->size; n++) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { if (NULL == node->topology) { @@ -1077,6 +1116,10 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n+offset))) { continue; } + opal_output_verbose(1, orte_nidmap_output, + "%s ASSIGNING NODE %s WITH TOPO: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name, t2->sig); if (NULL == node->topology) { OBJ_RETAIN(t2); node->topology = t2; diff --git a/orte/util/nidmap.h b/orte/util/nidmap.h index 521cc352c0e..3acc29b9277 100644 --- a/orte/util/nidmap.h +++ b/orte/util/nidmap.h @@ -44,6 +44,8 @@ BEGIN_C_DECLS #define ORTE_NON_CONTIG_NODE_CMD 0x02 +ORTE_DECLSPEC void orte_util_nidmap_init(void); + ORTE_DECLSPEC int orte_util_nidmap_create(char **regex); ORTE_DECLSPEC int orte_util_nidmap_parse(char *regex); From c4aae19ac84d56637fd14fcb3d4ff2c49c8cb44e Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 10 May 2017 15:16:41 -0700 Subject: [PATCH 09/29] Finally fix the problem - the key was knowing there were more than 2 topologies involved, and that the HNP is not allocated. Give up on being cute and just search the darned list of topologies - there won't be that many, and if there are (so the scan takes awhile), then too bad. 
Signed-off-by: Ralph Castain (cherry picked from commit f47124e4d387c37e95c814e9373c2e0c398130c1) --- orte/util/nidmap.c | 91 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 24 deletions(-) diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index 1243e1dd8ff..3b2ec9bdfeb 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -483,6 +483,7 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) orte_node_t *nptr; int rc; uint8_t ui8; + orte_topology_t *ortetopo; /* setup the list of results */ OBJ_CONSTRUCT(&slots, opal_list_t); @@ -515,13 +516,40 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) return rc; } - /* there is always one topology - our own - so start with it */ + /* handle the topologies - as the most common case by far + * is to have homogeneous topologies, we only send them + * if something is different. We know that the HNP is + * the first topology, and that any differing topology + * on the compute nodes must follow. So send the topologies + * if and only if: + * + * (a) the HNP is being used to house application procs and + * there is more than one topology on our list; or + * + * (b) the HNP is not being used, but there are more than + * two topologies on our list, thus indicating that + * there are multiple topologies on the compute nodes + */ nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); - tp = OBJ_NEW(orte_regex_range_t); - tp->t = nptr->topology; - tp->cnt = 1; + if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) { + /* assign a NULL topology so we still account for our presence, + * but don't cause us to send topology info when not needed */ + tp = OBJ_NEW(orte_regex_range_t); + tp->t = NULL; + tp->cnt = 1; + } else { + /* there is always one topology - our own - so start with it */ + tp = OBJ_NEW(orte_regex_range_t); + tp->t = nptr->topology; + tp->cnt = 1; + } opal_list_append(&topos, &tp->super); + 
opal_output_verbose(5, orte_nidmap_output, + "%s STARTING WITH TOPOLOGY FOR NODE %s: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + nptr->name, (NULL == tp->t) ? "NULL" : tp->t->sig); + /* likewise, we have slots */ slt = OBJ_NEW(orte_regex_range_t); slt->slots = nptr->slots; @@ -554,22 +582,33 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) opal_list_append(&slots, &slt->super); } /* check the topologies */ - if (NULL == nptr->topology) { + if (NULL != tp->t && NULL == nptr->topology) { /* we don't know this topology, likely because * we don't have a daemon on the node */ tp = OBJ_NEW(orte_regex_range_t); tp->t = NULL; tp->cnt = 1; + opal_output_verbose(5, orte_nidmap_output, + "%s ADD TOPOLOGY FOR NODE %s: NULL", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nptr->name); opal_list_append(&topos, &tp->super); } else { /* is this the next in line */ if (tp->t == nptr->topology) { tp->cnt++; + opal_output_verbose(5, orte_nidmap_output, + "%s CONTINUE TOPOLOGY RANGE (%d) WITH NODE %s: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + tp->cnt, nptr->name, tp->t->sig); } else { /* need to start another range */ tp = OBJ_NEW(orte_regex_range_t); tp->t = nptr->topology; tp->cnt = 1; + opal_output_verbose(5, orte_nidmap_output, + "%s STARTING NEW TOPOLOGY RANGE WITH NODE %s: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + nptr->name, tp->t->sig); opal_list_append(&topos, &tp->super); } } @@ -645,31 +684,32 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) free(tmp); } - /* handle the topologies - as the most common case by far - * is to have homogeneous topologies, we only send them - * if something is different. We know that the HNP is - * the first topology, and that any differing topology - * on the compute nodes must follow. 
So send the topologies - * if and only if: - * - * (a) the HNP is being used to house application procs and - * there is more than one topology on our list; or - * - * (b) the HNP is not being used, but there are more than - * two topologies on our list, thus indicating that - * there are multiple topologies on the compute nodes - */ - if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) { - /* remove the first topo on the list */ - item = opal_list_remove_first(&topos); - OBJ_RELEASE(item); + /* don't try to be cute - there aren't going to be that many + * topologies, so just scan the list and see if they are the + * same, excluding any NULL values */ + ortetopo = NULL; + test = false; + OPAL_LIST_FOREACH(rng, &topos, orte_regex_range_t) { + if (NULL == rng->t) { + continue; + } + if (NULL == ortetopo) { + ortetopo = rng->t; + } else if (0 != strcmp(ortetopo->sig, rng->t->sig)) { + /* we have a difference, so send them */ + test = true; + } } tmp = NULL; - if (1 < opal_list_get_size(&topos)) { + if (test) { opal_buffer_t bucket, *bptr; OBJ_CONSTRUCT(&bucket, opal_buffer_t); while (NULL != (item = opal_list_remove_first(&topos))) { rng = (orte_regex_range_t*)item; + opal_output_verbose(5, orte_nidmap_output, + "%s PASSING TOPOLOGY %s RANGE %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == rng->t) ? 
"NULL" : rng->t->sig, rng->cnt); if (NULL == tmp) { asprintf(&tmp, "%d", rng->cnt); } else { @@ -738,6 +778,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer) } OBJ_DESTRUCT(&bucket); } else { + opal_output_verbose(1, orte_nidmap_output, + "%s NOT PASSING TOPOLOGIES", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* need to pack the NULL just to terminate the region */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &tmp, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); From ff44d69314159296b18ca6ffd06642420b49e6f2 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 11 May 2017 06:50:59 -0700 Subject: [PATCH 10/29] When a daemon force-terminates, we don't get the show_help message it was trying to send because the message is at a lower priority than the termination event. Resolve this by putting the oob in its own progress thread. Also, use only that one thread by default - if someone needs more progress threads in the OOB, they can use the MCA param to get them. Signed-off-by: Ralph Castain (cherry picked from commit 9164afbb08baedca12f8473950a736a77e21aefc) --- orte/mca/oob/base/oob_base_frame.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/orte/mca/oob/base/oob_base_frame.c b/orte/mca/oob/base/oob_base_frame.c index 56ec2ad8fc0..be5c745e507 100644 --- a/orte/mca/oob/base/oob_base_frame.c +++ b/orte/mca/oob/base/oob_base_frame.c @@ -55,11 +55,7 @@ orte_oob_base_t orte_oob_base = {0}; static int orte_oob_base_register(mca_base_register_flag_t flags) { - if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) { - orte_oob_base.num_threads = 0; - } else { - orte_oob_base.num_threads = 8; - } + orte_oob_base.num_threads = 0; (void)mca_base_var_register("orte", "oob", "base", "num_progress_threads", "Number of independent progress OOB messages for each interface", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, @@ -95,6 +91,10 @@ static int orte_oob_base_close(void) OBJ_RELEASE(cli); } + if (!ORTE_PROC_IS_APP && !ORTE_PROC_IS_TOOL) { + 
opal_progress_thread_finalize("OOB-BASE"); + } + /* destruct our internal lists */ OBJ_DESTRUCT(&orte_oob_base.actives); @@ -122,7 +122,11 @@ static int orte_oob_base_open(mca_base_open_flag_t flags) opal_hash_table_init(&orte_oob_base.peers, 128); OBJ_CONSTRUCT(&orte_oob_base.actives, opal_list_t); - orte_oob_base.ev_base = orte_event_base; + if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) { + orte_oob_base.ev_base = orte_event_base; + } else { + orte_oob_base.ev_base = opal_progress_thread_init("OOB-BASE"); + } #if OPAL_ENABLE_FT_CR == 1 From 71f031d447ef1a09f9043f1b79534e1b1d3bdfff Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 12 May 2017 08:21:52 -0700 Subject: [PATCH 11/29] Fix total_slots_allocated computation On unmanaged allocations, we need to update the total_slots_allocated once the daemons have been launched and "discovered" their topology Signed-off-by: Ralph Castain (cherry picked from commit 29e083bffda3fceaddfe209fd38ff6a7e20433ee) --- orte/mca/plm/base/plm_base_launch_support.c | 2 ++ orte/mca/rmaps/base/rmaps_base_map_job.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index fb233fafbf0..677535aacf6 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -150,6 +150,7 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata) if (!orte_managed_allocation) { if (NULL != orte_set_slots && 0 != strncmp(orte_set_slots, "none", strlen(orte_set_slots))) { + caddy->jdata->total_slots_alloc = 0; for (i=0; i < orte_node_pool->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; @@ -160,6 +161,7 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, orte_set_slots)); orte_plm_base_set_slots(node); } + caddy->jdata->total_slots_alloc += node->slots; } } } 
diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c index 615a485bca3..8254bcfaf16 100644 --- a/orte/mca/rmaps/base/rmaps_base_map_job.c +++ b/orte/mca/rmaps/base/rmaps_base_map_job.c @@ -520,7 +520,9 @@ void orte_rmaps_base_display_map(orte_job_t *jdata) } } } else { - opal_output(orte_clean_output, " Data for JOB %s offset %s", ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(jdata->offset)); + opal_output(orte_clean_output, " Data for JOB %s offset %s Total slots allocated %lu", + ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(jdata->offset), + (long unsigned)jdata->total_slots_alloc); opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP); if (orte_xml_output) { fprintf(orte_xml_fp, "%s\n", output); From bcf00d06001014ce62ed26f58aa901e99004a54b Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 12 May 2017 08:01:16 -0700 Subject: [PATCH 12/29] Fix --nolocal Fix the --nolocal option by ensuring we always check/remove the HNP from the list of available nodes if the flag is set Ensure that the HNP node is included as available when nothing else is given Signed-off-by: Ralph Castain (cherry picked from commit 45bbd598c113e9f92b682ad56f53f05fd27eec2a) --- orte/mca/ras/base/ras_base_allocate.c | 2 + orte/mca/rmaps/base/rmaps_base_support_fns.c | 61 +++++++++++--------- 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/orte/mca/ras/base/ras_base_allocate.c b/orte/mca/ras/base/ras_base_allocate.c index 77c9e37ab08..0cf4eefcd4e 100644 --- a/orte/mca/ras/base/ras_base_allocate.c +++ b/orte/mca/ras/base/ras_base_allocate.c @@ -408,6 +408,8 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata) node->slots_max = 0; node->slots = 1; opal_list_append(&nodes, &node->super); + /* mark the HNP as "allocated" since we have nothing else to use */ + orte_hnp_is_allocated = true; /* store the results in the global resource pool - this removes the * list items diff --git 
a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index 6fd1d7cec0e..b29537bb648 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -341,28 +341,6 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr } addknown: - /* if the hnp was allocated, include it unless flagged not to */ - if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) { - if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) { - if (ORTE_NODE_STATE_DO_NOT_USE == node->state) { - OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, - "HNP IS MARKED NO_USE")); - /* clear this for future use, but don't include it */ - node->state = ORTE_NODE_STATE_UP; - } else if (ORTE_NODE_STATE_NOT_INCLUDED != node->state) { - OBJ_RETAIN(node); - if (initial_map) { - /* if this is the first app_context we - * are getting for an initial map of a job, - * then mark all nodes as unmapped - */ - ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); - } - opal_list_append(allocated_nodes, &node->super); - } - } - } - /* add everything in the node pool that can be used - add them * in daemon order, which may be different than the order in the * node pool. Since an empty list is passed into us, the list at @@ -370,8 +348,13 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr * node obviously has a daemon on it (us!) 
*/ if (0 == opal_list_get_size(allocated_nodes)) { - /* the list is empty */ - nd = NULL; + /* the list is empty - if the HNP is allocated, then add it */ + if (orte_hnp_is_allocated) { + nd = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); + opal_list_append(allocated_nodes, &nd->super); + } else { + nd = NULL; + } } else { nd = (orte_node_t*)opal_list_get_last(allocated_nodes); } @@ -487,10 +470,23 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr } else { item = opal_list_get_first(allocated_nodes); while (item != opal_list_get_end(allocated_nodes)) { + node = (orte_node_t*)item; + opal_output(0, "CHECKING NODE %s", node->name); /** save the next pointer in case we remove this node */ next = opal_list_get_next(item); + /* if the hnp was not allocated, or flagged not to be used, + * then remove it here */ + if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) { + node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); + if (node == (orte_node_t*)item) { + opal_output(0, "REMOVING HNP NODE"); + opal_list_remove_item(allocated_nodes, item); + OBJ_RELEASE(item); /* "un-retain" it */ + item = next; + continue; + } + } /** check to see if this node is fully used - remove if so */ - node = (orte_node_t*)item; if (0 != node->slots_max && node->slots_inuse > node->slots_max) { OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s Removing node %s: max %d inuse %d", @@ -498,7 +494,10 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr node->name, node->slots_max, node->slots_inuse)); opal_list_remove_item(allocated_nodes, item); OBJ_RELEASE(item); /* "un-retain" it */ - } else if (node->slots <= node->slots_inuse && + item = next; + continue; + } + if (node->slots <= node->slots_inuse && (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { /* remove the node as fully used */ OPAL_OUTPUT_VERBOSE((5, 
orte_rmaps_base_framework.framework_output, @@ -507,14 +506,20 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr node->name, node->slots, node->slots_inuse)); opal_list_remove_item(allocated_nodes, item); OBJ_RELEASE(item); /* "un-retain" it */ - } else if (node->slots > node->slots_inuse) { + item = next; + continue; + } + if (node->slots > node->slots_inuse) { /* add the available slots */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s node %s has %d slots available", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, node->slots - node->slots_inuse)); num_slots += node->slots - node->slots_inuse; - } else if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { + item = next; + continue; + } + if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { /* nothing needed to do here - we don't add slots to the * count as we don't have any available. Just let the mapper * do what it needs to do to meet the request From 964201f85af9937ce867618a087ed182b8792e3e Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 12 May 2017 12:41:36 -0700 Subject: [PATCH 13/29] Remove debug Signed-off-by: Ralph Castain (cherry picked from commit b527c40dae73d122243855e1b55a468df77c4f40) --- orte/mca/rmaps/base/rmaps_base_support_fns.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index b29537bb648..b9003c93f59 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -471,7 +471,6 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr item = opal_list_get_first(allocated_nodes); while (item != opal_list_get_end(allocated_nodes)) { node = (orte_node_t*)item; - opal_output(0, "CHECKING NODE %s", node->name); /** save the next pointer in case we remove this node */ next = opal_list_get_next(item); /* if the hnp was 
not allocated, or flagged not to be used, @@ -479,7 +478,6 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) { node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); if (node == (orte_node_t*)item) { - opal_output(0, "REMOVING HNP NODE"); opal_list_remove_item(allocated_nodes, item); OBJ_RELEASE(item); /* "un-retain" it */ item = next; From 612cd66ddbc2ed021d52094a8a61fe6278874dd3 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Mon, 8 May 2017 16:07:13 +0900 Subject: [PATCH 14/29] odls: fix handling of the orte fork agent Signed-off-by: Gilles Gouaillardet (cherry picked from commit 16fc0996e612695a6e66b7c9253f85bb521847f6) --- orte/mca/odls/alps/odls_alps_module.c | 4 +++- orte/mca/odls/default/odls_default_module.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/orte/mca/odls/alps/odls_alps_module.c b/orte/mca/odls/alps/odls_alps_module.c index 9d17521b440..6e5f09a5193 100644 --- a/orte/mca/odls/alps/odls_alps_module.c +++ b/orte/mca/odls/alps/odls_alps_module.c @@ -18,6 +18,8 @@ * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Rutgers, The State University of New Jersey. * All rights reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* * $COPYRIGHT$ * @@ -452,7 +454,7 @@ static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd) } } - execve(cd->app->app, cd->argv, cd->env); + execve(cd->cmd, cd->argv, cd->env); send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt", "execve error", orte_process_info.nodename, cd->app->app, strerror(errno)); diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index c95946d4193..6eb4f4280f5 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -18,6 +18,8 @@ * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Rutgers, The State University of New Jersey. * All rights reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * * $COPYRIGHT$ * @@ -431,7 +433,7 @@ static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd) } /* Exec the new executable */ - execve(cd->app->app, cd->argv, cd->env); + execve(cd->cmd, cd->argv, cd->env); getcwd(dir, sizeof(dir)); send_error_show_help(write_fd, 1, "help-orte-odls-default.txt", "execve error", From 0e205d4e06b7cbfa77ae729a6583d0933cc08a1a Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 12 May 2017 16:16:47 -0700 Subject: [PATCH 15/29] Add debug verbosity to the orte data server and pmix pub/lookup functions Start updating the various mappers to the new procedure. Remove the stale lama component as it is now very out-of-date. Bring round_robin and PPR online, and modify the mindist component (but cannot test/debug it). Remove unneeded test Fix memory corruption by re-initializing variable to NULL in loop Resolve the race condition identified by @ggouaillardet by resetting the mapped flag within the same event where it was set. There is no need to retain the flag beyond that point as it isn't used again. 
Add a new job attribute ORTE_JOB_FULLY_DESCRIBED to indicate that all the job information (including locations and binding) is included in the launch message. Thus, the backend daemons do not need to do any map computation for the job. Use this for the seq, rankfile, and mindist mappers until someone decides to update them. Note that this will maintain functionality, but means that users of those three mappers will see large launch messages and less performant scaling than those using the other mappers. Have the mindist module add procs to the job's proc array as it is a fully described module Protect the hnp-not-in-allocation case Per patch suggested by Gilles - protect the HNP node when it gets added in the absence of any other allocation or hostfile Signed-off-by: Ralph Castain (cherry picked from commit 657e701c6505e401412b5548c180a22c76832bf9) --- .gitignore | 1 + opal/mca/pmix/base/pmix_base_fns.c | 36 +- orte/mca/odls/base/odls_base_default_fns.c | 203 +- orte/mca/plm/base/plm_base_launch_support.c | 4 +- orte/mca/rmaps/base/Makefile.am | 5 +- orte/mca/rmaps/base/base.h | 3 +- orte/mca/rmaps/base/help-orte-rmaps-base.txt | 12 +- .../rmaps/base/rmaps_base_assign_locations.c | 80 + orte/mca/rmaps/base/rmaps_base_map_job.c | 63 +- orte/mca/rmaps/base/rmaps_base_ranking.c | 685 +++--- orte/mca/rmaps/base/rmaps_base_support_fns.c | 36 +- orte/mca/rmaps/base/rmaps_private.h | 5 +- orte/mca/rmaps/lama/.opal_ignore | 0 orte/mca/rmaps/lama/Makefile.am | 40 - orte/mca/rmaps/lama/help-orte-rmaps-lama.txt | 173 -- orte/mca/rmaps/lama/owner.txt | 7 - orte/mca/rmaps/lama/rmaps_lama.h | 177 -- orte/mca/rmaps/lama/rmaps_lama_component.c | 136 -- orte/mca/rmaps/lama/rmaps_lama_max_tree.c | 1182 ---------- orte/mca/rmaps/lama/rmaps_lama_module.c | 1914 ----------------- orte/mca/rmaps/lama/rmaps_lama_params.c | 878 -------- orte/mca/rmaps/mindist/rmaps_mindist_module.c | 115 +- orte/mca/rmaps/ppr/rmaps_ppr.c | 128 +- orte/mca/rmaps/rank_file/rmaps_rank_file.c | 15 +- 
orte/mca/rmaps/resilient/rmaps_resilient.c | 44 +- orte/mca/rmaps/rmaps.h | 17 +- orte/mca/rmaps/round_robin/Makefile.am | 4 +- orte/mca/rmaps/round_robin/rmaps_rr.c | 118 +- orte/mca/rmaps/round_robin/rmaps_rr.h | 9 +- orte/mca/rmaps/round_robin/rmaps_rr_assign.c | 171 ++ orte/mca/rmaps/seq/rmaps_seq.c | 6 +- orte/mca/state/base/state_base_fns.c | 2 - orte/mca/state/dvm/state_dvm.c | 2 +- orte/mca/state/hnp/state_hnp.c | 4 + orte/mca/state/novm/state_novm.c | 31 +- orte/orted/pmix/pmix_server_pub.c | 8 + .../data_type_support/orte_dt_packing_fns.c | 120 +- .../data_type_support/orte_dt_unpacking_fns.c | 106 +- orte/runtime/orte_data_server.c | 73 +- orte/test/mpi/Makefile | 9 +- orte/test/mpi/no-disconnect.c | 210 ++ orte/util/attr.c | 2 + orte/util/attr.h | 1 + orte/util/nidmap.c | 220 +- orte/util/nidmap.h | 8 +- 45 files changed, 1886 insertions(+), 5177 deletions(-) create mode 100644 orte/mca/rmaps/base/rmaps_base_assign_locations.c delete mode 100644 orte/mca/rmaps/lama/.opal_ignore delete mode 100644 orte/mca/rmaps/lama/Makefile.am delete mode 100644 orte/mca/rmaps/lama/help-orte-rmaps-lama.txt delete mode 100644 orte/mca/rmaps/lama/owner.txt delete mode 100644 orte/mca/rmaps/lama/rmaps_lama.h delete mode 100644 orte/mca/rmaps/lama/rmaps_lama_component.c delete mode 100644 orte/mca/rmaps/lama/rmaps_lama_max_tree.c delete mode 100644 orte/mca/rmaps/lama/rmaps_lama_module.c delete mode 100644 orte/mca/rmaps/lama/rmaps_lama_params.c create mode 100644 orte/mca/rmaps/round_robin/rmaps_rr_assign.c create mode 100644 orte/test/mpi/no-disconnect.c diff --git a/.gitignore b/.gitignore index b45ab10f922..679b39fb8a6 100644 --- a/.gitignore +++ b/.gitignore @@ -413,6 +413,7 @@ orte/test/mpi/memcached-dummy orte/test/mpi/coll_test orte/test/mpi/badcoll orte/test/mpi/iof +orte/test/mpi/no-disconnect orte/test/system/radix orte/test/system/sigusr_trap diff --git a/opal/mca/pmix/base/pmix_base_fns.c b/opal/mca/pmix/base/pmix_base_fns.c index bee99bd8062..cb9e4ccf43f 
100644 --- a/opal/mca/pmix/base/pmix_base_fns.c +++ b/opal/mca/pmix/base/pmix_base_fns.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 Mellanox Technologies, Inc. @@ -118,6 +118,12 @@ static void lookup_cbfunc(int status, opal_list_t *data, void *cbdata) cd->active = false; } +static void opcbfunc(int status, void *cbdata) +{ + struct lookup_caddy_t *cd = (struct lookup_caddy_t*)cbdata; + cd->active = false; +} + int opal_pmix_base_exchange(opal_value_t *indat, opal_pmix_pdata_t *outdat, int timeout) @@ -141,11 +147,29 @@ int opal_pmix_base_exchange(opal_value_t *indat, opal_list_append(&ilist, &info->super); /* publish it with "session" scope */ - rc = opal_pmix.publish(&ilist); - OPAL_LIST_DESTRUCT(&ilist); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; + if (NULL == opal_pmix.publish_nb) { + rc = opal_pmix.publish(&ilist); + OPAL_LIST_DESTRUCT(&ilist); + if (OPAL_SUCCESS != rc) { + OPAL_ERROR_LOG(rc); + return rc; + } + } else { + caddy.active = true; + rc = opal_pmix.publish_nb(&ilist, opcbfunc, &caddy); + if (OPAL_SUCCESS != rc) { + OPAL_ERROR_LOG(rc); + OPAL_LIST_DESTRUCT(&ilist); + return rc; + } + while (caddy.active) { + usleep(10); + } + OPAL_LIST_DESTRUCT(&ilist); + if (OPAL_SUCCESS != caddy.status) { + OPAL_ERROR_LOG(caddy.status); + return caddy.status; + } } /* lookup the other side's info - if a non-blocking form diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 30462ac4faa..8ce47c18e3b 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -131,7 +131,7 @@ int 
orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer, /* if we couldn't provide the allocation regex on the orted * cmd line, then we need to provide all the info here */ if (!orte_nidmap_communicated) { - if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&nidmap))) { + if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &nidmap))) { ORTE_ERROR_LOG(rc); return rc; } @@ -246,6 +246,22 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer, return rc; } + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + /* compute and pack the ppn regex */ + if (ORTE_SUCCESS != (rc = orte_util_nidmap_generate_ppn(jdata, &nidmap))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &nidmap, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + free(nidmap); + return rc; + } + free(nidmap); + } + + /* compute and pack the regex of ppn */ + return ORTE_SUCCESS; } @@ -262,13 +278,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, int rc; orte_std_cntr_t cnt; orte_job_t *jdata=NULL, *daemons; - int32_t n, k, m; + int32_t n, k; opal_buffer_t *bptr; - orte_node_t *node; orte_proc_t *pptr, *dmn; orte_app_context_t *app; - bool newmap = false; int8_t flag; + char *ppn; OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, "%s odls:constructing child list", @@ -356,7 +371,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, * the storage */ jdata->jobid = ORTE_JOBID_INVALID; OBJ_RELEASE(jdata); - /* get the correct job object */ + /* get the correct job object - it will be completely filled out */ if (NULL == (jdata = orte_get_job_data_object(*job))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; @@ -364,25 +379,65 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, } } else { opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata); - } - /* ensure the map object is 
present */ - if (NULL == jdata->map) { - jdata->map = OBJ_NEW(orte_job_map_t); - newmap = true; + /* ensure the map object is present */ + if (NULL == jdata->map) { + jdata->map = OBJ_NEW(orte_job_map_t); + } } - if (orte_no_vm) { - /* if we are operating novm, then mpirun will have sent us - * the complete array of procs - process it */ - for (n=0; n < jdata->procs->size; n++) { - if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) { - continue; + /* if the job is fully described, then mpirun will have computed + * and sent us the complete array of procs in the orte_job_t, so we + * don't need to do anything more here */ + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + if (!ORTE_PROC_IS_HNP) { + /* extract the ppn regex */ + cnt = 1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ppn, &cnt, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; } - if (ORTE_PROC_STATE_UNDEF == pptr->state) { - /* not ready for use yet */ - continue; + /* populate the node array of the job map and the proc array of + * the job object so we know how many procs are on each node */ + if (ORTE_SUCCESS != (rc = orte_util_nidmap_parse_ppn(jdata, ppn))) { + ORTE_ERROR_LOG(rc); + free(ppn); + goto REPORT_ERROR; + } + free(ppn); + /* now assign locations to the procs */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; } + } + /* compute the ranks and add the proc objects + * to the jdata->procs array */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + /* and finally, compute the local and node ranks */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + } + + /* now that the node array in the job map and jdata are completely filled out,. 
+ * we need to "wireup" the procs to their nodes so other utilities can + * locate them */ + for (n=0; n < jdata->procs->size; n++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) { + continue; + } + if (ORTE_PROC_STATE_UNDEF == pptr->state) { + /* not ready for use yet */ + continue; + } + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + /* the parser will have already made the connection, but the fully described + * case won't have done it, so connect the proc to its node here */ opal_output_verbose(5, orte_odls_base_framework.framework_output, "%s GETTING DAEMON FOR PROC %s WITH PARENT %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -401,86 +456,37 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, } OBJ_RETAIN(dmn->node); pptr->node = dmn->node; - /* add proc to node - note that num_procs for the - * node was already correctly unpacked, so don't - * increment it here */ - OBJ_RETAIN(pptr); - opal_pointer_array_add(dmn->node->procs, pptr); - - /* add the node to the map, if not already there */ - if (!ORTE_FLAG_TEST(dmn->node, ORTE_NODE_FLAG_MAPPED)) { - OBJ_RETAIN(dmn->node); - ORTE_FLAG_SET(dmn->node, ORTE_NODE_FLAG_MAPPED); - opal_pointer_array_add(jdata->map->nodes, dmn->node); - if (newmap) { - jdata->map->num_nodes++; - } - } - - /* see if it belongs to us */ - if (pptr->parent == ORTE_PROC_MY_NAME->vpid) { - /* is this child on our current list of children */ - if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) { - /* not on the local list */ - OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, - "%s[%s:%d] adding proc %s to my local list", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, - ORTE_NAME_PRINT(&pptr->name))); - /* keep tabs of the number of local procs */ - jdata->num_local_procs++; - /* add this proc to our child list */ - OBJ_RETAIN(pptr); - ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL); - 
opal_pointer_array_add(orte_local_children, pptr); - } - - /* if the job is in restart mode, the child must not barrier when launched */ - if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { - orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); - } - /* mark that this app_context is being used on this node */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx); - ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE); - } - } - } else { - /* create the map - will already have been done for the novm case */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_job(jdata))) { - ORTE_ERROR_LOG(rc); - goto REPORT_ERROR; } - /* find our local procs */ - for (n=0; n < jdata->map->nodes->size; n++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) { - continue; - } - if (node->index != (int)ORTE_PROC_MY_NAME->vpid) { - continue; + /* see if it belongs to us */ + if (pptr->parent == ORTE_PROC_MY_NAME->vpid) { + /* is this child on our current list of children */ + if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) { + /* not on the local list */ + OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, + "%s[%s:%d] adding proc %s to my local list", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + __FILE__, __LINE__, + ORTE_NAME_PRINT(&pptr->name))); + /* keep tabs of the number of local procs */ + jdata->num_local_procs++; + /* add this proc to our child list */ + OBJ_RETAIN(pptr); + ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL); + opal_pointer_array_add(orte_local_children, pptr); } - for (m=0; m < node->procs->size; m++) { - if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, m))) { - continue; - } - if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) { - /* not on the local list */ - OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, - "%s[%s:%d] adding proc %s to my local list", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, 
- ORTE_NAME_PRINT(&pptr->name))); - /* keep tabs of the number of local procs */ - jdata->num_local_procs++; - /* add this proc to our child list */ - OBJ_RETAIN(pptr); - ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL); - opal_pointer_array_add(orte_local_children, pptr); - /* mark that this app_context is being used on this node */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx); - ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE); - } + + /* if the job is in restart mode, the child must not barrier when launched */ + if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { + orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } + /* mark that this app_context is being used on this node */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx); + ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE); } + } + + if (!ORTE_PROC_IS_HNP && + !orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { /* compute and save bindings of local children */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) { ORTE_ERROR_LOG(rc); @@ -488,13 +494,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, } } - /* reset any node map flags we used so the next job will start clean */ - for (n=0; n < jdata->map->nodes->size; n++) { - if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) { - ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); - } - } - /* if we wanted to see the map, now is the time to display it */ if (jdata->map->display_map) { orte_rmaps_base_display_map(jdata); diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 677535aacf6..0c54807a7e6 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -209,7 +209,7 @@ static void files_ready(int status, void *cbdata) if (ORTE_SUCCESS != 
status) { ORTE_FORCED_TERMINATE(status); } else { - ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SYSTEM_PREP); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); } } @@ -1497,7 +1497,7 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, /* convert the nodes with daemons to a regex */ param = NULL; - if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&param))) { + if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &param))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/mca/rmaps/base/Makefile.am b/orte/mca/rmaps/base/Makefile.am index 41b0420847c..d2930632ea4 100644 --- a/orte/mca/rmaps/base/Makefile.am +++ b/orte/mca/rmaps/base/Makefile.am @@ -12,7 +12,7 @@ # Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011 Los Alamos National Security, LLC. # All rights reserved. -# Copyright (c) 2015 Intel, Inc. All rights reserved. +# Copyright (c) 2015-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -31,7 +31,8 @@ libmca_rmaps_la_SOURCES += \ base/rmaps_base_support_fns.c \ base/rmaps_base_ranking.c \ base/rmaps_base_print_fns.c \ - base/rmaps_base_binding.c + base/rmaps_base_binding.c \ + base/rmaps_base_assign_locations.c dist_ortedata_DATA = base/help-orte-rmaps-base.txt diff --git a/orte/mca/rmaps/base/base.h b/orte/mca/rmaps/base/base.h index b1f540241a7..beb4cee0445 100644 --- a/orte/mca/rmaps/base/base.h +++ b/orte/mca/rmaps/base/base.h @@ -99,7 +99,8 @@ OBJ_CLASS_DECLARATION(orte_rmaps_base_selected_module_t); /* * Map a job */ -ORTE_DECLSPEC int orte_rmaps_base_map_job(orte_job_t *jdata); +ORTE_DECLSPEC void orte_rmaps_base_map_job(int sd, short args, void *cbdata); +ORTE_DECLSPEC int orte_rmaps_base_assign_locations(orte_job_t *jdata); /** * Utility routines to get/set vpid mapping for the job diff --git a/orte/mca/rmaps/base/help-orte-rmaps-base.txt b/orte/mca/rmaps/base/help-orte-rmaps-base.txt index c04acf413d9..2f5f5b5d0c7 100644 --- 
a/orte/mca/rmaps/base/help-orte-rmaps-base.txt +++ b/orte/mca/rmaps/base/help-orte-rmaps-base.txt @@ -13,7 +13,7 @@ # Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011 Los Alamos National Security, LLC. # All rights reserved. -# Copyright (c) 2014 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -410,3 +410,13 @@ Either the -host or -hostfile options were given, but the number of processes to start was omitted. This combination is not supported. Please specify the number of processes to run and try again. +# +[failed-assignments] +The attempt to assign hardware locations to processes on a +compute node failed: + + Node: %s + Policy: %s + +We cannot continue - please check that the policy is in +accordance with the actual available hardware. diff --git a/orte/mca/rmaps/base/rmaps_base_assign_locations.c b/orte/mca/rmaps/base/rmaps_base_assign_locations.c new file mode 100644 index 00000000000..b1536ded0aa --- /dev/null +++ b/orte/mca/rmaps/base/rmaps_base_assign_locations.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include <string.h> + +#include "orte/mca/mca.h" +#include "opal/util/output.h" +#include "opal/mca/base/base.h" + +#include "orte/runtime/orte_globals.h" +#include "orte/util/show_help.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "orte/mca/rmaps/base/base.h" +#include "orte/mca/rmaps/base/rmaps_private.h" + + +int orte_rmaps_base_assign_locations(orte_job_t *jdata) +{ + int rc; + orte_rmaps_base_selected_module_t *mod; + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps: assigning locations for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* cycle thru the available mappers until one agrees to assign + * locations for the job + */ + if (1 == opal_list_get_size(&orte_rmaps_base.selected_modules)) { + /* forced selection */ + mod = (orte_rmaps_base_selected_module_t*)opal_list_get_first(&orte_rmaps_base.selected_modules); + jdata->map->req_mapper = strdup(mod->component->mca_component_name); + } + OPAL_LIST_FOREACH(mod, &orte_rmaps_base.selected_modules, orte_rmaps_base_selected_module_t) { + if (NULL == mod->module->assign_locations) { + continue; + } + if (ORTE_SUCCESS == (rc = mod->module->assign_locations(jdata))) { + return rc; + } + /* mappers return "next option" if they didn't attempt to + * process the job. anything else is a true error. 
+ */ + if (ORTE_ERR_TAKE_NEXT_OPTION != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + + /* if we get here without doing the assignments, then that's an error */ + orte_show_help("help-orte-rmaps-base.txt", "failed-assignments", true, + orte_process_info.nodename, + orte_rmaps_base_print_mapping(jdata->map->mapping)); + return ORTE_ERROR; +} diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c index 8254bcfaf16..d5e2ac304dc 100644 --- a/orte/mca/rmaps/base/rmaps_base_map_job.c +++ b/orte/mca/rmaps/base/rmaps_base_map_job.c @@ -42,8 +42,10 @@ #include "orte/mca/rmaps/base/rmaps_private.h" -int orte_rmaps_base_map_job(orte_job_t *jdata) +void orte_rmaps_base_map_job(int fd, short args, void *cbdata) { + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = caddy->jdata; orte_node_t *node; int rc, i, ppx = 0; bool did_map, given, pernode = false; @@ -116,7 +118,9 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) /* inform the user of the error */ orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true); OPAL_LIST_DESTRUCT(&nodes); - return ORTE_ERR_BAD_PARAM; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } } nprocs += slots; @@ -335,7 +339,9 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) int i; if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } t0 = node->topology; for (i=1; i < orte_node_pool->size; i++) { @@ -368,15 +374,26 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) */ if (ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); - return rc; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } } + /* reset any node map flags we used so the next job will start clean */ + for (i=0; i < 
jdata->map->nodes->size; i++) { + if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) { + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); + } + } + if (did_map && ORTE_ERR_RESOURCE_BUSY == rc) { /* the map was done but nothing could be mapped * for launch as all the resources were busy */ orte_show_help("help-orte-rmaps-base.txt", "cannot-launch", true); - return rc; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } /* if we get here without doing the map, or with zero procs in @@ -386,7 +403,9 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) orte_show_help("help-orte-rmaps-base.txt", "failed-map", true, did_map ? "mapped" : "unmapped", jdata->num_procs, jdata->map->num_nodes); - return ORTE_ERR_INVALID_NUM_PROCS; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } /* if any node is oversubscribed, then check to see if a binding @@ -399,17 +418,29 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) } } - /* compute and save local ranks */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { - ORTE_ERROR_LOG(rc); - return rc; - } + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + /* compute and save location assignments */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; + } + } else { + /* compute and save local ranks */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; + } - if (orte_no_vm) { /* compute and save bindings */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) { ORTE_ERROR_LOG(rc); - return rc; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + 
return; } } @@ -427,7 +458,11 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) } } - return ORTE_SUCCESS; + /* set the job state to the next position */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE); + + /* cleanup */ + OBJ_RELEASE(caddy); } void orte_rmaps_base_display_map(orte_job_t *jdata) diff --git a/orte/mca/rmaps/base/rmaps_base_ranking.c b/orte/mca/rmaps/base/rmaps_base_ranking.c index b297290a4d6..cb5d6a09a0c 100644 --- a/orte/mca/rmaps/base/rmaps_base_ranking.c +++ b/orte/mca/rmaps/base/rmaps_base_ranking.c @@ -49,19 +49,17 @@ #include "orte/mca/rmaps/base/base.h" static int rank_span(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes, hwloc_obj_type_t target, unsigned cache_level) { + orte_app_context_t *app; hwloc_obj_t obj; - int num_objs, i, j, rc; + int num_objs, i, j, m, n, rc; orte_vpid_t num_ranked=0; orte_node_t *node; - orte_proc_t *proc; + orte_proc_t *proc, *pptr; orte_vpid_t vpid; int cnt; - opal_list_item_t *item; hwloc_obj_t locale; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, @@ -85,18 +83,144 @@ static int rank_span(orte_job_t *jdata, * are mapped */ - vpid = jdata->num_procs; - cnt = 0; - while (cnt < app->num_procs) { - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; + vpid = 0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + + cnt = 0; + while (cnt < app->num_procs) { + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + /* get the number of objects - only consider those we can actually use */ + num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, + cache_level, OPAL_HWLOC_AVAILABLE); + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + 
"mca:rmaps:rank_span: found %d objects on node %s with %d procs", + num_objs, node->name, (int)node->num_procs); + if (0 == num_objs) { + return ORTE_ERR_NOT_SUPPORTED; + } + + /* for each object */ + for (i=0; i < num_objs && cnt < app->num_procs; i++) { + obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, + cache_level, i, OPAL_HWLOC_AVAILABLE); + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span: working object %d", i); + + /* cycle thru the procs on this node */ + for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span skipping proc %s - from another job, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* ignore procs that are already assigned */ + if (ORTE_VPID_INVALID != proc->name.vpid) { + continue; + } + /* ignore procs from other apps */ + if (proc->app_idx != app->idx) { + continue; + } + /* protect against bozo case */ + locale = NULL; + if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { + ORTE_ERROR_LOG(ORTE_ERROR); + return ORTE_ERROR; + } + /* ignore procs not on this object */ + if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span: proc at position %d is not on object %d", + j, i); + continue; + } + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid)); + proc->name.vpid = vpid++; + if (0 == cnt) { + app->first_rank = proc->name.vpid; + } + cnt++; + + /* insert the proc into the jdata array */ + if (NULL != (pptr = 
(orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; + /* move to next object */ + break; + } + } + } + } + } + + return ORTE_SUCCESS; +} + +static int rank_fill(orte_job_t *jdata, + hwloc_obj_type_t target, + unsigned cache_level) +{ + orte_app_context_t *app; + hwloc_obj_t obj; + int num_objs, i, j, m, n, rc; + orte_vpid_t num_ranked=0; + orte_node_t *node; + orte_proc_t *proc, *pptr; + orte_vpid_t vpid; + int cnt; + hwloc_obj_t locale; + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_fill: for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* if the ranking is fill, then we rank all the procs + * within a given object before moving on to the next + * + * Node 0 Node 1 + * Obj 0 Obj 1 Obj 0 Obj 1 + * 0 1 4 5 8 9 12 13 + * 2 3 6 7 10 11 14 15 + */ + + vpid = 0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + + cnt = 0; + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } /* get the number of objects - only consider those we can actually use */ num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level, OPAL_HWLOC_AVAILABLE); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: found %d objects on node %s with %d procs", + "mca:rmaps:rank_fill: found %d objects on node %s with %d procs", num_objs, node->name, (int)node->num_procs); if (0 == num_objs) { return ORTE_ERR_NOT_SUPPORTED; @@ -108,7 +232,7 @@ static int rank_span(orte_job_t *jdata, cache_level, i, 
OPAL_HWLOC_AVAILABLE); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: working object %d", i); + "mca:rmaps:rank_fill: working object %d", i); /* cycle thru the procs on this node */ for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { @@ -118,7 +242,7 @@ static int rank_span(orte_job_t *jdata, /* ignore procs from other jobs */ if (proc->name.jobid != jdata->jobid) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span skipping proc %s - from another job, num_ranked %d", + "mca:rmaps:rank_fill skipping proc %s - from another job, num_ranked %d", ORTE_NAME_PRINT(&proc->name), num_ranked); continue; } @@ -130,7 +254,7 @@ static int rank_span(orte_job_t *jdata, if (proc->app_idx != app->idx) { continue; } - /* protect against bozo case */ + /* protect against bozo case */ locale = NULL; if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { ORTE_ERROR_LOG(ORTE_ERROR); @@ -139,19 +263,23 @@ static int rank_span(orte_job_t *jdata, /* ignore procs not on this object */ if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: proc at position %d is not on object %d", + "mca:rmaps:rank_fill: proc at position %d is not on object %d", j, i); continue; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid)); + "mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid)); proc->name.vpid = vpid++; if (0 == cnt) { app->first_rank = proc->name.vpid; } cnt++; - /* insert the proc into the jdata array - no harm if already there */ + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); if (ORTE_SUCCESS != (rc = 
opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(rc); return rc; @@ -160,8 +288,6 @@ static int rank_span(orte_job_t *jdata, * new bookmark */ jdata->bookmark = node; - /* move to next object */ - break; } } } @@ -170,138 +296,26 @@ static int rank_span(orte_job_t *jdata, return ORTE_SUCCESS; } -static int rank_fill(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes, - hwloc_obj_type_t target, - unsigned cache_level) -{ - hwloc_obj_t obj; - int num_objs, i, j, rc; - orte_vpid_t num_ranked=0; - orte_node_t *node; - orte_proc_t *proc; - orte_vpid_t vpid; - int cnt; - opal_list_item_t *item; - hwloc_obj_t locale; - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: for job %s", - ORTE_JOBID_PRINT(jdata->jobid)); - - /* if the ranking is fill, then we rank all the procs - * within a given object before moving on to the next - * - * Node 0 Node 1 - * Obj 0 Obj 1 Obj 0 Obj 1 - * 0 1 4 5 8 9 12 13 - * 2 3 6 7 10 11 14 15 - */ - - vpid = jdata->num_procs; - cnt = 0; - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; - /* get the number of objects - only consider those we can actually use */ - num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, - cache_level, OPAL_HWLOC_AVAILABLE); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: found %d objects on node %s with %d procs", - num_objs, node->name, (int)node->num_procs); - if (0 == num_objs) { - return ORTE_ERR_NOT_SUPPORTED; - } - - /* for each object */ - for (i=0; i < num_objs && cnt < app->num_procs; i++) { - obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, - cache_level, i, OPAL_HWLOC_AVAILABLE); - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: working object %d", i); - - /* cycle thru the procs on this node 
*/ - for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill skipping proc %s - from another job, num_ranked %d", - ORTE_NAME_PRINT(&proc->name), num_ranked); - continue; - } - /* ignore procs that are already assigned */ - if (ORTE_VPID_INVALID != proc->name.vpid) { - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - /* protect against bozo case */ - locale = NULL; - if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { - ORTE_ERROR_LOG(ORTE_ERROR); - return ORTE_ERROR; - } - /* ignore procs not on this object */ - if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: proc at position %d is not on object %d", - j, i); - continue; - } - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid)); - proc->name.vpid = vpid++; - if (0 == cnt) { - app->first_rank = proc->name.vpid; - } - cnt++; - - /* insert the proc into the jdata array - no harm if already there */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - } - } - } - - return ORTE_SUCCESS; -} - static int rank_by(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes, hwloc_obj_type_t target, unsigned cache_level) { + orte_app_context_t *app; hwloc_obj_t obj; - int num_objs, i, j, rc; + int num_objs, i, j, m, n, rc; orte_vpid_t num_ranked=0; orte_node_t *node; - 
orte_proc_t *proc; + orte_proc_t *proc, *pptr; orte_vpid_t vpid; int cnt; opal_pointer_array_t objs; bool all_done; - opal_list_item_t *item; hwloc_obj_t locale; if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) { - return rank_span(jdata, app, nodes, target, cache_level); + return rank_span(jdata, target, cache_level); } else if (ORTE_RANKING_FILL & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) { - return rank_fill(jdata, app, nodes, target, cache_level); + return rank_fill(jdata, target, cache_level); } /* if ranking is not spanned or filled, then we @@ -316,122 +330,140 @@ static int rank_by(orte_job_t *jdata, * 4 6 5 7 12 14 13 15 */ - /* setup the pointer array */ - OBJ_CONSTRUCT(&objs, opal_pointer_array_t); - opal_pointer_array_init(&objs, 2, INT_MAX, 2); - - vpid = jdata->num_procs; - cnt = 0; - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; - /* get the number of objects - only consider those we can actually use */ - num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, - cache_level, OPAL_HWLOC_AVAILABLE); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: found %d objects on node %s with %d procs", - num_objs, node->name, (int)node->num_procs); - if (0 == num_objs) { - return ORTE_ERR_NOT_SUPPORTED; - } - /* collect all the objects */ - for (i=0; i < num_objs; i++) { - obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, - cache_level, i, OPAL_HWLOC_AVAILABLE); - opal_pointer_array_set_item(&objs, i, obj); + vpid = 0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; } - /* cycle across the objects, assigning a proc to each one, - * until all procs have been assigned - unfortunately, since - * more than this job may be mapped onto a node, the number - * of procs on 
the node can't be used to tell us when we - * are done. Instead, we have to just keep going until all - * procs are ranked - which means we have to make one extra - * pass thru the loop - * - * Perhaps someday someone will come up with a more efficient - * algorithm, but this works for now. - */ - all_done = false; - while (!all_done && cnt < app->num_procs) { - all_done = true; - /* cycle across the objects */ - for (i=0; i < num_objs && cnt < app->num_procs; i++) { - obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i); + /* setup the pointer array */ + OBJ_CONSTRUCT(&objs, opal_pointer_array_t); + opal_pointer_array_init(&objs, 2, INT_MAX, 2); - /* find the next proc on this object */ - for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d", - ORTE_NAME_PRINT(&proc->name), num_ranked); - continue; - } - /* ignore procs that are already ranked */ - if (ORTE_VPID_INVALID != proc->name.vpid) { - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { - continue; - } - /* ignore procs on other objects */ - if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { + cnt = 0; + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + + /* get the number of objects - only consider those we can actually use */ + num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, + cache_level, OPAL_HWLOC_AVAILABLE); + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + 
"mca:rmaps:rank_by: found %d objects on node %s with %d procs", + num_objs, node->name, (int)node->num_procs); + if (0 == num_objs) { + OBJ_DESTRUCT(&objs); + return ORTE_ERR_NOT_SUPPORTED; + } + /* collect all the objects */ + for (i=0; i < num_objs; i++) { + obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, + cache_level, i, OPAL_HWLOC_AVAILABLE); + opal_pointer_array_set_item(&objs, i, obj); + } + + /* cycle across the objects, assigning a proc to each one, + * until all procs have been assigned - unfortunately, since + * more than this job may be mapped onto a node, the number + * of procs on the node can't be used to tell us when we + * are done. Instead, we have to just keep going until all + * procs are ranked - which means we have to make one extra + * pass thru the loop + * + * Perhaps someday someone will come up with a more efficient + * algorithm, but this works for now. + */ + all_done = false; + while (!all_done && cnt < app->num_procs) { + all_done = true; + /* cycle across the objects */ + for (i=0; i < num_objs && cnt < app->num_procs && all_done; i++) { + obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i); + /* find the next proc for this job and app_context */ + for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* ignore procs that are already ranked */ + if (ORTE_VPID_INVALID != proc->name.vpid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by skipping proc %s - already ranked, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* ignore procs from other apps 
*/ + if (proc->app_idx != app->idx) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by skipping proc %s - from another app, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* protect against bozo case */ + locale = NULL; + if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { + ORTE_ERROR_LOG(ORTE_ERROR); + return ORTE_ERROR; + } + /* ignore procs not on this object */ + if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by: proc at position %d is not on object %d", + j, i); + continue; + } + /* assign the vpid */ + proc->name.vpid = vpid++; + if (0 == cnt) { + app->first_rank = proc->name.vpid; + } + cnt++; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: proc at position %d is not on object %d", - j, i); - continue; + "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid)); + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&objs); + return rc; + } + /* flag that one was mapped */ + all_done = false; + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; + /* move to next object */ + break; } - proc->name.vpid = vpid++; - if (0 == cnt) { - app->first_rank = proc->name.vpid; - } - cnt++; - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid)); - /* insert the proc into the jdata array - no harm if already there */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, 
proc))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* flag that one was mapped */ - all_done = false; - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - /* move to next object */ - break; } } } + /* cleanup */ + OBJ_DESTRUCT(&objs); } - - /* cleanup */ - OBJ_DESTRUCT(&objs); - return ORTE_SUCCESS; } -int orte_rmaps_base_compute_vpids(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes) +int orte_rmaps_base_compute_vpids(orte_job_t *jdata) { orte_job_map_t *map; + orte_app_context_t *app; orte_vpid_t vpid; - int j, cnt; + int j, m, n, cnt; orte_node_t *node; - orte_proc_t *proc; + orte_proc_t *proc, *pptr; int rc; - opal_list_item_t *item; bool one_found; map = jdata->map; @@ -445,7 +477,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by NUMA for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_NODE, 0))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_NODE, 0))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -460,7 +492,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by socket for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_SOCKET, 0))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_SOCKET, 0))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -475,7 +507,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by L3cache for job 
%s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 3))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 3))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -490,7 +522,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by L2cache for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 2))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 2))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -505,7 +537,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by L1cache for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 1))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 1))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -520,7 +552,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by core for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CORE, 0))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CORE, 0))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -528,6 +560,7 @@ int 
orte_rmaps_base_compute_vpids(orte_job_t *jdata, } ORTE_ERROR_LOG(rc); } + opal_output(0, "DONE"); return rc; } @@ -535,7 +568,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by hwthread for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_PU, 0))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_PU, 0))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -549,26 +582,83 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, if (ORTE_RANK_BY_NODE == ORTE_GET_RANKING_POLICY(map->ranking) || ORTE_RANK_BY_BOARD == ORTE_GET_RANKING_POLICY(map->ranking)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:base: computing vpids by node for job %s app %d on %d nodes", - ORTE_JOBID_PRINT(jdata->jobid), (int)app->idx, - (int)opal_list_get_size(nodes)); - /* bozo check */ - if (0 == opal_list_get_size(nodes)) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } + "mca:rmaps:base: computing vpids by node for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); /* assign the ranks round-robin across nodes - only one board/node * at this time, so they are equivalent */ - cnt=0; - vpid=jdata->num_procs; - one_found = true; - while (cnt < app->num_procs && one_found) { - one_found = false; - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; + vpid=0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + cnt=0; + one_found = true; + while (cnt < app->num_procs && one_found) { + one_found = false; + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = 
(orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + for (j=0; j < node->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + continue; + } + /* ignore procs from other apps */ + if (proc->app_idx != app->idx) { + continue; + } + if (ORTE_VPID_INVALID != proc->name.vpid) { + continue; + } + proc->name.vpid = vpid++; + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { + ORTE_ERROR_LOG(rc); + return rc; + } + cnt++; + one_found = true; + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; + break; /* move on to next node */ + } + } + } + if (cnt < app->num_procs) { + ORTE_ERROR_LOG(ORTE_ERR_FATAL); + return ORTE_ERR_FATAL; + } + } + return ORTE_SUCCESS; + } + + rankbyslot: + if (ORTE_RANK_BY_SLOT == ORTE_GET_RANKING_POLICY(map->ranking)) { + /* assign the ranks sequentially */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:base: computing vpids by slot for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + vpid = 0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; @@ -581,70 +671,25 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, if (proc->app_idx != app->idx) { continue; } - if 
(ORTE_VPID_INVALID != proc->name.vpid) { - continue; + if (ORTE_VPID_INVALID == proc->name.vpid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:base: assigning rank %s to node %s", + ORTE_VPID_PRINT(vpid), node->name); + proc->name.vpid = vpid++; + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; } - proc->name.vpid = vpid++; - /* insert the proc into the jdata array - no harm if already there */ + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(rc); return rc; } - cnt++; - one_found = true; - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - break; /* move on to next node */ - } - } - } - if (cnt < app->num_procs) { - ORTE_ERROR_LOG(ORTE_ERR_FATAL); - return ORTE_ERR_FATAL; - } - return ORTE_SUCCESS; - } - - rankbyslot: - if (ORTE_RANK_BY_SLOT == ORTE_GET_RANKING_POLICY(map->ranking)) { - /* assign the ranks sequentially */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:base: computing vpids by slot for job %s", - ORTE_JOBID_PRINT(jdata->jobid)); - vpid = jdata->num_procs; - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; - - for (j=0; j < node->procs->size; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - if (ORTE_VPID_INVALID == proc->name.vpid) { - opal_output_verbose(5, 
orte_rmaps_base_framework.framework_output, - "mca:rmaps:base: assigning rank %s to node %s", - ORTE_VPID_PRINT(vpid), node->name); - proc->name.vpid = vpid++; - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - } - /* insert the proc into the jdata array - no harm if already there */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { - ORTE_ERROR_LOG(rc); - return rc; } } } diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index b9003c93f59..cf8b9b71f69 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -351,6 +351,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr /* the list is empty - if the HNP is allocated, then add it */ if (orte_hnp_is_allocated) { nd = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); + OBJ_RETAIN(nd); opal_list_append(allocated_nodes, &nd->super); } else { nd = NULL; @@ -476,8 +477,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr /* if the hnp was not allocated, or flagged not to be used, * then remove it here */ if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) { - node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); - if (node == (orte_node_t*)item) { + if (0 == node->index) { opal_list_remove_item(allocated_nodes, item); OBJ_RELEASE(item); /* "un-retain" it */ item = next; @@ -508,24 +508,24 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr continue; } if (node->slots > node->slots_inuse) { - /* add the available slots */ - OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, - "%s node %s has %d slots available", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name, node->slots - node->slots_inuse)); - num_slots += node->slots - 
node->slots_inuse; - item = next; - continue; + /* add the available slots */ + OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, + "%s node %s has %d slots available", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name, node->slots - node->slots_inuse)); + num_slots += node->slots - node->slots_inuse; + item = next; + continue; } if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { - /* nothing needed to do here - we don't add slots to the - * count as we don't have any available. Just let the mapper - * do what it needs to do to meet the request - */ - OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, - "%s node %s is fully used, but available for oversubscription", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name)); + /* nothing needed to do here - we don't add slots to the + * count as we don't have any available. Just let the mapper + * do what it needs to do to meet the request + */ + OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, + "%s node %s is fully used, but available for oversubscription", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name)); } else { /* if we cannot use it, remove it from list */ opal_list_remove_item(allocated_nodes, item); diff --git a/orte/mca/rmaps/base/rmaps_private.h b/orte/mca/rmaps/base/rmaps_private.h index 8950a1b76df..d9e7f9dcfe0 100644 --- a/orte/mca/rmaps/base/rmaps_private.h +++ b/orte/mca/rmaps/base/rmaps_private.h @@ -12,6 +12,7 @@ * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -55,9 +56,7 @@ ORTE_DECLSPEC orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata, ORTE_DECLSPEC orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list, orte_job_t *jdata); -ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes); +ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata); ORTE_DECLSPEC int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata); diff --git a/orte/mca/rmaps/lama/.opal_ignore b/orte/mca/rmaps/lama/.opal_ignore deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/orte/mca/rmaps/lama/Makefile.am b/orte/mca/rmaps/lama/Makefile.am deleted file mode 100644 index 0512f8b10da..00000000000 --- a/orte/mca/rmaps/lama/Makefile.am +++ /dev/null @@ -1,40 +0,0 @@ -# -# Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. -# -# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -dist_ortedata_DATA = help-orte-rmaps-lama.txt - -sources = \ - rmaps_lama_module.c \ - rmaps_lama_max_tree.c \ - rmaps_lama_params.c \ - rmaps_lama.h \ - rmaps_lama_component.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). 
- -if MCA_BUILD_orte_rmaps_lama_DSO -component_noinst = -component_install = mca_rmaps_lama.la -else -component_noinst = libmca_rmaps_lama.la -component_install = -endif - -mcacomponentdir = $(ortelibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_rmaps_lama_la_SOURCES = $(sources) -mca_rmaps_lama_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_rmaps_lama_la_SOURCES =$(sources) -libmca_rmaps_lama_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/rmaps/lama/help-orte-rmaps-lama.txt b/orte/mca/rmaps/lama/help-orte-rmaps-lama.txt deleted file mode 100644 index f1b7239bb4f..00000000000 --- a/orte/mca/rmaps/lama/help-orte-rmaps-lama.txt +++ /dev/null @@ -1,173 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. -# Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. -# -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English general help file for LAMA Mapper. -# -[orte-rmaps-lama:multi-apps-and-zero-np] -RMAPS found multiple applications to be launched, with at least one that failed -to specify the number of processes to execute. When specifying multiple -applications, you must specify how many processes of each to launch via the --np argument. -# -[orte-rmaps-lama:oversubscribe] -RMaps LAMA detected oversubscription after mapping %d of %d processes. -Since you have asked not to oversubscribe the resources the job will not -be launched. If you would instead like to oversubscribe the resources -try using the --oversubscribe option to mpirun. -# -[orte-rmaps-lama:no-resources-available] -RMaps LAMA detected that there are not enough resources to map the -remainder of the job. Check the command line options, and the number of -nodes allocated to this job. 
- Application Context : %d - # of Processes Successfully Mapped: %d - # of Processes Requested : %d - Mapping : %s - Binding : %s - MPPR : %s - Ordering : %s -# -[orte-rmaps-lama:merge-conflict-bad-prune-src] -RMaps LAMA detected that it needed to prune a level of the hierarchy that -was necessary for one of the command line parameters. Check your allocation -and the options below to make sure they are correct. - Conflicting Level Description: %s - Mapping : %s - Binding : %s - MPPR : %s - Ordering : %s -# -[invalid mapping option] -The specified mapping option is not supported with the LAMA rmaps -mapper: - - Specified mapping option: %s - Reason it is invalid: %s - -LAMA supports the following options to the mpirun --map-by option: - - node, numa, socket, l1cache, l2cache, l3cache, core, hwthread, slot - -Alternatively, LAMA supports specifying a sequence of letters in the -rmaps_lama_map MCA parameter; each letter indicates a "direction" for -mapping. The rmaps_lama_map MCA parameter is richer/more flexible -than the --may-by CLI option. If rmaps_lama_map is specified, the -following letters must be specified: - - h: hardware thread - c: processor core - s: processor socket - n: node (server) - -The following may also optionally be included in the mapping string: - - N: NUMA node - L1: L1 cache - L2: L2 cache - L3: L3 cache - -For example, the two commands below are equivalent: - - mpirun --mca rmaps lama --mca rmaps_lama_map csNh ... - mpirun --mca rmaps lama --map-by core ... 
-# -[invalid binding option] -The specified binding option is not supported with the LAMA rmaps -mapper: - - Specified binding option: %s - Reason it is invalid: %s - -LAMA binding options can be specified via the mpirun --bind-to command -line option or rmaps_lama_bind MCA param: - - --bind-to rmaps_lama_binding - Locality option option - ---------------- --------- ------------------ - Hardware thread hwthread h - Processor core core c - Processor socket socket s - NUMA node numa N - L1 cache l1cache L1 - L2 cache l2cache L2 - L3 cache l3cache L3 - Node (server) node n - -The --bind-to option assumes a single locality (e.g., bind each MPI -process to a single core, socket, etc.). The rmaps_lama_bind MCA -param requires an integer specifying how many localities to which to -bind. For example, the following two command lines are equivalent, -and bind each MPI process to a single core: - - mpirun --btl rmaps lama --mca rmaps_lama_bind 1c ... - mpirun --btl rmaps lama --bind-to core ... - -The rmaps_lama_bind MCA parameter is more flexible than the --bind-to -CLI option, because it allows binding to multiple resources. For -example, specifing an rmaps_lama_bind value of "2c" binds each MPI -process to two cores. -# -[invalid ordering option] -The specified ordering option is not supported. - - Specified ordering option: %s - -The LAMA ordering can be specified via the rmaps_lama_ordering MCA -parameter. - -Two options are supported for ordering ranks in MPI_COMM_WORLD (MCW): - - s: Sequential. MCW rank ordering is sequential by hardware thread - across all nodes. E.g., MCW rank 0 is the first process on node - 0; MCW rank 1 is the second process on node 0, and so on. - n: Natural. MCW rank ordering follows the "natural" mapping layout. - For example, in a by-socket layout, MCW rank 0 is the first - process on the 1st socket on node 0. MCW rank 1 is then the - first process on the 2nd socket on node 0. And so on. 
-# -[invalid mppr option] -The specified Max Processes Per Resource (MPPR) value is invalid (in -the rmaps_lama_mppr MCA paramter): - - Specified MPPR: %s - Reason is is invalid: %s - -The MPPR is a comma-delimited list of specifications indicating how -many processes are allowed on a given type of resource before an MPI -job is considered to have oversubscribed that resource. Each -specification is a token in the format of "NUMBER:RESOURCE". For -example, the default MPPR of "1:c" means that Open MPI will map one -process per processor core before considering cores to be -oversubscribed. - -Multiple specifications may be useful; for example "1:c,2:s" maintains -the default one-process-per-core limitation, but places an additional -limitation of only two processes per processor socket (assuming that -there are more than two cores per socket). - -The LAMA MPPR specifications are set via the rmaps_lama_mppr MCA -parameter. The following resources can be specified: - - Hardware thread h - Processor core c - Processor socket s - NUMA node N - L1 cache L1 - L2 cache L2 - L3 cache L3 - Node (server) n -# -[internal error] -An unexpected internal error occurred in the LAMA mapper; your job -will now fail. Sorry. - - File: %s - Message: %s diff --git a/orte/mca/rmaps/lama/owner.txt b/orte/mca/rmaps/lama/owner.txt deleted file mode 100644 index 0cc0384f0eb..00000000000 --- a/orte/mca/rmaps/lama/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: CISCO -status: maintenance diff --git a/orte/mca/rmaps/lama/rmaps_lama.h b/orte/mca/rmaps/lama/rmaps_lama.h deleted file mode 100644 index 8cb830f861e..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * - * Copyright (c) 2013-2017 Cisco Systems, Inc. 
All rights reserved - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * - * Resource Mapping - */ -#ifndef ORTE_RMAPS_LAMA_H -#define ORTE_RMAPS_LAMA_H - -#include "orte_config.h" - -#include "opal/class/opal_tree.h" - -#include "orte/mca/rmaps/rmaps.h" - -BEGIN_C_DECLS - -ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_lama_component; - -extern orte_rmaps_base_module_t orte_rmaps_lama_module; - - -/********************************* - * Structures & Defines - *********************************/ -/* - * JJH: Can we reuse the opal_hwloc_level_t data structure in - * opal/mca/hwloc/hwloc-internal.h - */ -typedef enum { - LAMA_LEVEL_MACHINE = 0, - LAMA_LEVEL_BOARD = 1, - LAMA_LEVEL_NUMA = 2, - LAMA_LEVEL_SOCKET = 3, - LAMA_LEVEL_CACHE_L3 = 4, - LAMA_LEVEL_CACHE_L2 = 5, - LAMA_LEVEL_CACHE_L1 = 6, - LAMA_LEVEL_CORE = 7, - LAMA_LEVEL_PU = 8, - LAMA_LEVEL_UNKNOWN = 9 -} rmaps_lama_level_type_t; - -typedef enum { - LAMA_ORDER_NATURAL = 0, - LAMA_ORDER_SEQ = 1 -} rmaps_lama_order_type_t; - -struct rmaps_lama_level_info_t { - rmaps_lama_level_type_t type; - int max_resources; -}; -typedef struct rmaps_lama_level_info_t rmaps_lama_level_info_t; - -/* - * Structure to attach to the hwloc tree - * Accounting for mppr - */ -struct rmaps_lama_hwloc_user_t { - opal_object_t super; - - opal_pointer_array_t *node_mppr; -}; -typedef struct rmaps_lama_hwloc_user_t rmaps_lama_hwloc_user_t; -OBJ_CLASS_DECLARATION(rmaps_lama_hwloc_user_t); - -struct rmaps_lama_node_mppr_t { - int max; - int cur; -}; -typedef struct rmaps_lama_node_mppr_t rmaps_lama_node_mppr_t; - -rmaps_lama_level_type_t lama_type_str_to_enum(char *param); -char * lama_type_enum_to_str(rmaps_lama_level_type_t param); - - -/********************************* - * Command Line Interface Parsing - *********************************/ -/* - * User defined command line interface (CLI) arguments - */ -extern char * rmaps_lama_cmd_map; -extern char * rmaps_lama_cmd_bind; 
-extern char * rmaps_lama_cmd_mppr; -extern char * rmaps_lama_cmd_ordering; -extern bool rmaps_lama_timing_enabled; -extern bool rmaps_lama_can_oversubscribe; -extern bool rmaps_lama_am_oversubscribing; - -/* - * Internal representations of command line arguments - */ -extern int lama_mapping_num_layouts; -extern rmaps_lama_level_type_t *lama_mapping_layout; - -extern rmaps_lama_level_type_t lama_binding_level; - -extern rmaps_lama_level_info_t *lama_mppr_levels; -extern int lama_mppr_num_levels; - -/* - * Homogeneous system optimization - */ -extern bool lama_mppr_max_tree_homogeneous_system; - -/* - * Maximum length of digits in CLI - */ -#define MAX_BIND_DIGIT_LEN 4 - -int rmaps_lama_process_alias_params(orte_job_t *jdata); - -int rmaps_lama_parse_mapping(char *layout, - rmaps_lama_level_type_t **layout_types, - rmaps_lama_level_type_t **layout_types_sorted, - int *num_types); -int rmaps_lama_parse_binding(char *layout, - rmaps_lama_level_type_t *binding_level, - int *num_types); -int rmaps_lama_parse_mppr(char *layout, - rmaps_lama_level_info_t **mppr_levels, - int *num_types); -int rmaps_lama_parse_ordering(char *layout, - rmaps_lama_order_type_t *order); - -bool rmaps_lama_ok_to_prune_level(rmaps_lama_level_type_t level); - -/********************************* - * Max Tree Structure - *********************************/ -struct rmaps_lama_max_tree_item_t { - opal_tree_item_t tree_element; - - rmaps_lama_level_type_t type; -}; -typedef struct rmaps_lama_max_tree_item_t rmaps_lama_max_tree_item_t; - - -/* - * Union all topologies into the max tree - */ -int rmaps_lama_build_max_tree(orte_job_t *jdata, opal_list_t *node_list, - opal_tree_t * max_tree, bool *is_homogeneous); - -/* - * Find a matching subtree - */ -hwloc_obj_t * rmaps_lama_find_nth_subtree_match(hwloc_topology_t hwloc_topo, - hwloc_obj_t parent_obj, - int nth, - rmaps_lama_level_type_t lama_key); -hwloc_obj_t * rmaps_lama_find_parent(hwloc_topology_t hwloc_topo, - hwloc_obj_t *child_obj, - 
rmaps_lama_level_type_t lama_key); - -/* - * Create Empty Tree - */ -opal_tree_t * rmaps_lama_create_empty_max_tree(void); - -/* - * Pretty Print - */ -void rmaps_lama_max_tree_pretty_print_tree(opal_tree_t *tree); - -END_C_DECLS - -#endif /* ORTE_RMAPS_LAMA_H */ diff --git a/orte/mca/rmaps/lama/rmaps_lama_component.c b/orte/mca/rmaps/lama/rmaps_lama_component.c deleted file mode 100644 index e8734dbec64..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama_component.c +++ /dev/null @@ -1,136 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * - * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/base/base.h" - -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" - -#include "rmaps_lama.h" - -/* - * Local functions - */ - -static int orte_rmaps_lama_register(void); -static int orte_rmaps_lama_query(mca_base_module_t **module, int *priority); - -static int module_priority; - -char * rmaps_lama_cmd_map = NULL; -char * rmaps_lama_cmd_bind = NULL; -char * rmaps_lama_cmd_mppr = NULL; -char * rmaps_lama_cmd_ordering = NULL; -bool rmaps_lama_timing_enabled = false; -bool rmaps_lama_can_oversubscribe = false; -bool rmaps_lama_am_oversubscribing = false; - -orte_rmaps_base_component_t mca_rmaps_lama_component = { - .base_version = { - ORTE_RMAPS_BASE_VERSION_2_0_0, - - .mca_component_name = "lama", - MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION), - .mca_query_component = orte_rmaps_lama_query, - .mca_register_component_params = orte_rmaps_lama_register, - }, - .base_data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT 
- }, -}; - - -static int orte_rmaps_lama_register(void) -{ - mca_base_component_t *c = &mca_rmaps_lama_component.base_version; - - /* JMS Artifically low for now */ - module_priority = 0; - (void) mca_base_component_var_register (c, "priority", "Priority of the LAMA rmaps component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &module_priority); - - rmaps_lama_timing_enabled = false; - (void) mca_base_component_var_register (c, "timing", - "Enable timing information. [Default = disabled]", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_timing_enabled); - - rmaps_lama_cmd_map = NULL; - (void) mca_base_component_var_register (c, "map", "LAMA Map: Process layout iteration ordering (See documentation)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_cmd_map); - - rmaps_lama_cmd_bind = NULL; - (void) mca_base_component_var_register (c, "bind", "LAMA Bind: Bind to the specified number of resources (See documentation)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_cmd_bind); - - rmaps_lama_cmd_mppr = NULL; - (void) mca_base_component_var_register (c, "mppr", "LAMA MPPR: Maximum number of the specified resources available (See documentation)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_cmd_mppr); - - rmaps_lama_cmd_ordering = NULL; - (void) mca_base_component_var_register (c, "ordering", "LAMA Ordering: Ordering (s) sequential, (n) natural - Default: n (See documentation)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_cmd_ordering); - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Priority %3d", - module_priority); - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: 
Map : %s", - (NULL == rmaps_lama_cmd_map) ? "NULL" : rmaps_lama_cmd_map); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Bind : %s", - (NULL == rmaps_lama_cmd_bind) ? "NULL" : rmaps_lama_cmd_bind); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: MPPR : %s", - (NULL == rmaps_lama_cmd_mppr) ? "NULL" : rmaps_lama_cmd_mppr); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Order : %s", - (NULL == rmaps_lama_cmd_ordering) ? "NULL" : rmaps_lama_cmd_ordering); - - return ORTE_SUCCESS; -} - - -static int orte_rmaps_lama_query(mca_base_module_t **module, int *priority) -{ - /* Only run on the HNP */ - - *priority = module_priority; - *module = (mca_base_module_t *)&orte_rmaps_lama_module; - - return ORTE_SUCCESS; -} diff --git a/orte/mca/rmaps/lama/rmaps_lama_max_tree.c b/orte/mca/rmaps/lama/rmaps_lama_max_tree.c deleted file mode 100644 index a1183028b3b..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama_max_tree.c +++ /dev/null @@ -1,1182 +0,0 @@ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * Max Tree Support Functions - * - */ -#include "rmaps_lama.h" - -#include "orte/util/show_help.h" - -#include "orte/mca/errmgr/errmgr.h" - -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" - -/********************************* - * Max Tree Construction - *********************************/ -/* - * Convert an hwloc tree to an opal_tree - */ -static int rmaps_lama_convert_hwloc_tree_to_opal_tree(opal_tree_t *opal_tree, - hwloc_topology_t *hwloc_topo); - -/* - * Convert an hwloc subtree to an opal subtree - */ -static int rmaps_lama_convert_hwloc_subtree(hwloc_obj_t obj, - opal_tree_item_t *parent_item); - -/* - * Convert LAMA key to HWLOC key/depth - */ -static int rmaps_lama_convert_lama_key_to_hwloc_key(rmaps_lama_level_type_t lama_key, - hwloc_obj_type_t *hwloc_key, int *depth); - -/* - * Convert HWLOC key/depth to LAMA key - */ -static int rmaps_lama_convert_hwloc_key_to_lama_key(hwloc_obj_type_t hwloc_key, int depth, - rmaps_lama_level_type_t *lama_key); - -/* - * Compare two HWLOC topologies for similar structure - */ -static int rmaps_lama_hwloc_compare_topos(hwloc_topology_t *left, hwloc_topology_t *right); -static int rmaps_lama_hwloc_compare_subtrees(hwloc_obj_t left, hwloc_obj_t right); - -/* - * Merge two opal_trees - */ -static int rmaps_lama_merge_trees(opal_tree_t *src_tree, opal_tree_t *into_tree, - opal_tree_item_t *src_parent, opal_tree_item_t *into_parent); - -/* - * Prune the max tree to just those levels specified - */ -static int rmaps_lama_prune_max_tree(opal_tree_t *max_tree, opal_tree_item_t *parent_item); - -/* - * Annotate the hwloc tree for MPPR accounting - */ -static int rmaps_lama_annotate_node_for_mppr(orte_node_t *node, hwloc_obj_t obj); - -/* - * Access the MPPR for the specified key - */ -static int rmaps_lama_get_mppr_for_key(orte_node_t *node, rmaps_lama_level_type_t lama_key); - -/* - * Recursive core of 
nth_subtree_match - */ -static int rmaps_lama_find_nth_subtree_match_core(hwloc_topology_t hwloc_topo, - hwloc_obj_t parent_obj, - int nth, - int *num_found, - hwloc_obj_type_t hwloc_key, - int depth, - hwloc_obj_t *cur_child); - -static void rmaps_lama_max_tree_item_construct(rmaps_lama_max_tree_item_t *item) -{ - item->type = LAMA_LEVEL_UNKNOWN; -} - - -/********************************* - * Max Tree Accessors/Functions - *********************************/ -OBJ_CLASS_INSTANCE(rmaps_lama_max_tree_item_t, - opal_tree_item_t, - rmaps_lama_max_tree_item_construct, NULL); - -static int lama_max_tree_comp(opal_tree_item_t *item, void *key); -static int lama_max_tree_serialize(opal_tree_item_t *item, opal_buffer_t *buffer); -static int lama_max_tree_deserialize(opal_buffer_t *buffer, opal_tree_item_t **item); -static void * lama_max_tree_get_key(opal_tree_item_t *item); - - -/********************************* - * Max Tree Pretty Print - *********************************/ -static char * rmaps_lama_max_tree_pretty_print_subtree_element_get(opal_tree_t *tree, - opal_tree_item_t *parent, - int level); -static void pretty_print_subtree(opal_tree_t *tree, opal_tree_item_t *parent, int level); -static void pretty_print_subtree_element(opal_tree_t *tree, opal_tree_item_t *parent, int level); - - -/********************************* - * Function Defintions - *********************************/ -int rmaps_lama_build_max_tree(orte_job_t *jdata, opal_list_t *node_list, - opal_tree_t * max_tree, bool *is_homogeneous) -{ - int ret; - opal_tree_t *tmp_tree = NULL; - hwloc_topology_t topo, *last_topo = NULL; - orte_node_t *cur_node = NULL; - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Building the Max Tree..."); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: 
---------------------------------"); - - /* - * Assume homogeneous system, unless otherwise noted - */ - *is_homogeneous = true; - - /* - * Process all other unique trees from remote daemons who are in - * this allocation - */ - for(cur_node = (orte_node_t*)opal_list_get_first(node_list); - cur_node != (orte_node_t*)opal_list_get_end(node_list); - cur_node = (orte_node_t*)opal_list_get_next(cur_node) ) { - if (NULL == (topo = cur_node->topology)) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- No Tree Available: %s (skipping)", cur_node->name); - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Converting Remote Tree: %s", cur_node->name); - - /* - * Convert to opal_tree - */ - tmp_tree = rmaps_lama_create_empty_max_tree(); - rmaps_lama_convert_hwloc_tree_to_opal_tree(tmp_tree, &topo); - if( 11 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - rmaps_lama_max_tree_pretty_print_tree(tmp_tree); - } - - /* - * Compare the current and last topologies if we are still considering - * this max tree to represent a homogeneous system. - */ - if( *is_homogeneous ) { - if( NULL == last_topo ) { - last_topo = &topo; - } else { - if( 0 != rmaps_lama_hwloc_compare_topos(last_topo, &topo) ) { - *is_homogeneous = false; - } - } - } - - /* - * Prune the input tree so that is only contains levels that the user - * asked for. 
- */ - if( 11 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Pruning input Tree..."); - } - if( ORTE_SUCCESS != (ret = rmaps_lama_prune_max_tree(tmp_tree, opal_tree_get_root(tmp_tree))) ) { - return ret; - } - if( 11 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Input Tree... - Post Prune"); - rmaps_lama_max_tree_pretty_print_tree(tmp_tree); - } - - /* - * Merge into max_tree - */ - if( opal_tree_is_empty(max_tree) ) { - opal_tree_dup(tmp_tree, max_tree); - } else { - if( ORTE_SUCCESS != (ret = rmaps_lama_merge_trees(tmp_tree, - max_tree, - opal_tree_get_root(tmp_tree), - opal_tree_get_root(max_tree) ))) { - return ret; - } - } - - /* - * Release and move on... - */ - OBJ_RELEASE(tmp_tree); - tmp_tree = NULL; - } - - - /* - * Fill out the MPPR accounting information for each node - */ - for(cur_node = (orte_node_t*)opal_list_get_first(node_list); - cur_node != (orte_node_t*)opal_list_get_end(node_list); - cur_node = (orte_node_t*)opal_list_get_next(cur_node) ) { - if( ORTE_SUCCESS != (ret = rmaps_lama_annotate_node_for_mppr(cur_node, - hwloc_get_obj_by_depth(cur_node->topology, 0, 0))) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - } - - /* - * JJH: NEEDS TESTING - * Note: This check is in place, but not used at the moment due to lack of - * system availability. Pending system availability and further testing, - * just assume heterogeneous. - */ - *is_homogeneous = false; - - /* - * Display the final Max Tree - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Final Max Tree... - %s system", - (*is_homogeneous ? 
"Homogeneous" : "Heterogeneous") ); - if( 11 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - rmaps_lama_max_tree_pretty_print_tree(max_tree); - } - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - return ORTE_SUCCESS; -} - -static int rmaps_lama_convert_hwloc_tree_to_opal_tree(opal_tree_t *opal_tree, hwloc_topology_t *hwloc_topo) -{ - hwloc_obj_t topo_root; - - if( 15 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Converting Topology:"); - /* opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO); */ - opal_dss.dump(0, *hwloc_topo, OPAL_HWLOC_TOPO); - } - - topo_root = hwloc_get_root_obj(*hwloc_topo); - - rmaps_lama_convert_hwloc_subtree(topo_root, - opal_tree_get_root(opal_tree)); - - return ORTE_SUCCESS; -} - -static int rmaps_lama_convert_hwloc_subtree(hwloc_obj_t obj, - opal_tree_item_t *parent_item) -{ - rmaps_lama_max_tree_item_t *max_tree_item = NULL; - char * key_child_str = NULL; - char * key_parent_str = NULL; - - while (obj) { - /* - * Create new tree item - */ - max_tree_item = OBJ_NEW(rmaps_lama_max_tree_item_t); - - /* - * Convert the HWLOC object to the LAMA key - */ - rmaps_lama_convert_hwloc_key_to_lama_key(obj->type, - obj->attr->cache.depth, - &(max_tree_item->type)); - - /* - * Append tree item to parent. Unless it is the same as the - * parent (L1 instruction vs data cache). JJH: Newer versions - * of hwloc can differentiate from the obj->attr->cache.type. 
- */ - if( NULL != obj->parent && - obj->parent->type == obj->type && - obj->parent->attr->cache.depth == obj->attr->cache.depth ) { - key_child_str = lama_type_enum_to_str(max_tree_item->type); - key_parent_str = lama_type_enum_to_str(((rmaps_lama_max_tree_item_t*)parent_item)->type); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Warning: Identical level detected: " - "Child [%s] vs Parent [%s]", - key_child_str, key_parent_str); - free(key_child_str); - free(key_parent_str); - - /* - * Add descendants if they exist - */ - if (obj->first_child) { - rmaps_lama_convert_hwloc_subtree(obj->first_child, - parent_item); - } - } else { - opal_tree_add_child(parent_item, &max_tree_item->tree_element); - - /* - * Add descendants if they exist - */ - if (obj->first_child) { - rmaps_lama_convert_hwloc_subtree(obj->first_child, - &max_tree_item->tree_element); - } - } - - /* - * Advance to next sibling - */ - obj = obj->next_sibling; - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_annotate_node_for_mppr(orte_node_t *node, hwloc_obj_t obj) -{ - rmaps_lama_hwloc_user_t *hwloc_userdata = NULL; - rmaps_lama_node_mppr_t *mppr_accounting = NULL; - rmaps_lama_level_type_t lama_key; - opal_hwloc_topo_data_t *opal_hwloc_topo = NULL; - int i; - - /* - * Attach our user pointer to the topology, if it is not already there. - * We will fill it in as needed later. - * - * Note: opal/mca/hwloc/base/hwloc_base_util.c attaches their own object - * to the userdata. There is a pointer in that structure we can use without - * interfering with what OPAL is trying to do. - */ - if( NULL == obj->userdata ) { - /* Some objects may not have topo data associated with them - * JJH: This is memory leak :/ Fix. 
- */ - obj->userdata = (void*)OBJ_NEW(opal_hwloc_topo_data_t); - } - if( NULL != obj->userdata ) { - opal_hwloc_topo = (opal_hwloc_topo_data_t*)(obj->userdata); - - if( NULL == opal_hwloc_topo->userdata ) { - hwloc_userdata = OBJ_NEW(rmaps_lama_hwloc_user_t); - opal_hwloc_topo->userdata = hwloc_userdata; - } else { - hwloc_userdata = (rmaps_lama_hwloc_user_t*)(opal_hwloc_topo->userdata); - } - } - - - /* - * Add node information if it is not already there - */ - mppr_accounting = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, node->index); - if( NULL == mppr_accounting ) { - /* - * Add MPPR accounting for this node associated with this object - */ - rmaps_lama_convert_hwloc_key_to_lama_key(obj->type, obj->attr->cache.depth, &lama_key); - - mppr_accounting = (rmaps_lama_node_mppr_t*)malloc(sizeof(rmaps_lama_node_mppr_t)); - mppr_accounting->max = rmaps_lama_get_mppr_for_key(node, lama_key); - mppr_accounting->cur = 0; - - opal_pointer_array_set_item(hwloc_userdata->node_mppr, node->index, mppr_accounting); - } - - - /* - * Decend tree - */ - for(i = 0; i < (int)obj->arity; ++i ) { - rmaps_lama_annotate_node_for_mppr(node, - obj->children[i]); - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_get_mppr_for_key(orte_node_t *node, rmaps_lama_level_type_t lama_key) -{ - int i; - - for( i = 0; i < lama_mppr_num_levels; ++i ) { - if( lama_key == lama_mppr_levels[i].type ) { - return lama_mppr_levels[i].max_resources; - } - } - - return -1; -} - -static int rmaps_lama_convert_lama_key_to_hwloc_key(rmaps_lama_level_type_t lama_key, hwloc_obj_type_t *hwloc_key, int *depth) -{ - *depth = 0; - - switch(lama_key) { - case LAMA_LEVEL_MACHINE: - *hwloc_key = HWLOC_OBJ_MACHINE; - break; - /* Note: HWLOC does not support boards */ -#if 0 - case LAMA_LEVEL_BOARD: - *hwloc_key = HWLOC_OBJ_MACHINE; - break; -#endif - case LAMA_LEVEL_SOCKET: - *hwloc_key = HWLOC_OBJ_SOCKET; - break; - case LAMA_LEVEL_CORE: - *hwloc_key = HWLOC_OBJ_CORE; - break; - 
case LAMA_LEVEL_PU: - *hwloc_key = HWLOC_OBJ_PU; - break; - case LAMA_LEVEL_CACHE_L1: - *hwloc_key = HWLOC_OBJ_CACHE; - *depth = 1; - break; - case LAMA_LEVEL_CACHE_L2: - *hwloc_key = HWLOC_OBJ_CACHE; - *depth = 2; - break; - case LAMA_LEVEL_CACHE_L3: - *hwloc_key = HWLOC_OBJ_CACHE; - *depth = 3; - break; - case LAMA_LEVEL_NUMA: - *hwloc_key = HWLOC_OBJ_NODE; - break; - default: - *hwloc_key = HWLOC_OBJ_TYPE_MAX; - break; - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_convert_hwloc_key_to_lama_key(hwloc_obj_type_t hwloc_key, int depth, rmaps_lama_level_type_t *lama_key) -{ - switch(hwloc_key) { - case HWLOC_OBJ_MACHINE: - *lama_key = LAMA_LEVEL_MACHINE; - break; - /* Node: HWLOC does not support boards */ -#if 0 - case HWLOC_OBJ_BOARD: - *lama_key = LAMA_LEVEL_BOARD; - break; -#endif - case HWLOC_OBJ_SOCKET: - *lama_key = LAMA_LEVEL_SOCKET; - break; - case HWLOC_OBJ_CORE: - *lama_key = LAMA_LEVEL_CORE; - break; - case HWLOC_OBJ_PU: - *lama_key = LAMA_LEVEL_PU; - break; - case HWLOC_OBJ_CACHE: - if( 1 == depth ) { - *lama_key = LAMA_LEVEL_CACHE_L1; - } - else if( 2 == depth ) { - *lama_key = LAMA_LEVEL_CACHE_L2; - } - else if( 3 == depth ) { - *lama_key = LAMA_LEVEL_CACHE_L3; - } - else { - *lama_key = LAMA_LEVEL_UNKNOWN; - } - break; - case HWLOC_OBJ_NODE: - *lama_key = LAMA_LEVEL_NUMA; - break; - default: - *lama_key = LAMA_LEVEL_UNKNOWN; - break; - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_hwloc_compare_topos(hwloc_topology_t *left, hwloc_topology_t *right) -{ - hwloc_obj_t left_root; - hwloc_obj_t right_root; - - /* - * Note: I hope that there is a 'better' way of doing this natively with - * HWLOC, but it is not obvious if they have the ability to compare - * topologies. So do a depth first comparison of the trees. 
- * You may be able to use the below: - * OPAL_EQUAL != opal_dss.compare(*last_topo, topo, OPAL_HWLOC_TOPO); - */ - - left_root = hwloc_get_obj_by_depth(*left, 0, 0); - right_root = hwloc_get_obj_by_depth(*right, 0, 0); - - return rmaps_lama_hwloc_compare_subtrees(left_root, right_root); -} - -static int rmaps_lama_hwloc_compare_subtrees(hwloc_obj_t left, hwloc_obj_t right) -{ - int i, ret; - - /* - * Check Types - */ - if( 0 != (ret = hwloc_compare_types(left->type, right->type)) ) { - return ret; - } - - /* - * Check 'arity' at this level - */ - if( left->arity > right->arity ) { - return -1; - } - else if( left->arity < right->arity ) { - return 1; - } - - /* - * Check all subtrees - */ - for(i = 0; i < (int)left->arity; ++i ) { - if( 0 != (ret = rmaps_lama_hwloc_compare_subtrees(left->children[i], - right->children[i])) ) { - return ret; - } - } - - /* - * Subtree is the same if we get here - */ - return 0; -} - -static int rmaps_lama_merge_trees(opal_tree_t *src_tree, opal_tree_t *max_tree, - opal_tree_item_t *src_parent, opal_tree_item_t *max_parent) -{ - int ret, exit_status = ORTE_SUCCESS; - rmaps_lama_level_type_t *key_src, *key_max; - opal_tree_item_t *child_item = NULL, *max_grandparent = NULL; - opal_tree_item_t *max_child_item = NULL; - int num_max, num_src; - int i; - char *key_src_str = NULL; - char *key_max_str = NULL; -#if 1 - char *str = NULL; -#endif - - /* - * Basecase - */ - if( NULL == src_parent ) { - return ORTE_SUCCESS; - } - - key_src = (rmaps_lama_level_type_t*)src_tree->get_key(src_parent); - key_max = (rmaps_lama_level_type_t*)max_tree->get_key(max_parent); - - key_src_str = lama_type_enum_to_str(*key_src); - key_max_str = lama_type_enum_to_str(*key_max); - - if( 15 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: CHECK: Merge Trees: Keys Src (%2d - %s) vs Max (%2d - %s)", - *key_src, key_src_str, *key_max, key_max_str); - 
} - - /* - * Make sure keys at this level match. - * - * JJH: Give up if they do not match. - * JJH: We should pick a victim and prune from the tree - * JJH: preferably from the 'native' tree. - */ - if( 0 != max_tree->comp(max_parent, src_tree->get_key(src_parent)) ) { - /* - * If the source conflicts due to cache, iterate to children to find a match. - * JJH: Double check this for different heterogenous systems - */ - if( LAMA_LEVEL_CACHE_L3 == *key_src || - LAMA_LEVEL_CACHE_L2 == *key_src || - LAMA_LEVEL_CACHE_L1 == *key_src || - LAMA_LEVEL_NUMA == *key_src ) { - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Warning: Merge Trees: " - "Src with Conflicting Memory Hierarchy [Src (%2d - %s) vs Max (%2d - %s)]", - *key_src, key_src_str, *key_max, key_max_str); - - /* - * If we are pruning a cache level, then check to make sure it is - * not important to the process layout. - */ - if( !rmaps_lama_ok_to_prune_level(*key_src) ) { - orte_show_help("help-orte-rmaps-lama.txt", - "orte-rmaps-lama:merge-conflict-bad-prune-src", - true, - key_src_str, - (NULL == rmaps_lama_cmd_map ? "[Not Provided]" : rmaps_lama_cmd_map), - (NULL == rmaps_lama_cmd_bind ? "[Not Provided]" : rmaps_lama_cmd_bind), - (NULL == rmaps_lama_cmd_mppr ? "[Not Provided]" : rmaps_lama_cmd_mppr), - (NULL == rmaps_lama_cmd_ordering ? 
"[Not Provided]" : rmaps_lama_cmd_ordering)); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * If the number of children at this pruned level was larger than - * the max tree arity at this level, then duplicate the max_tree - * element the approprate number of times - */ - max_grandparent = opal_tree_get_parent(max_parent); - num_max = opal_tree_num_children(max_grandparent); - num_src = opal_tree_num_children(src_parent); - - for(i = 0; i < (num_src - num_max); ++i ) { -#if 1 - str = rmaps_lama_max_tree_pretty_print_subtree_element_get(max_tree, max_parent, 0); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Merge: Appending child %s - post prune", - str); - free(str); -#endif - /* Duplicate max child subtree */ - opal_tree_copy_subtree(max_tree, max_parent, max_tree, max_grandparent); - } - - /* - * Iterate to children, until we find a match - */ - for(child_item = opal_tree_get_first_child(src_parent); - child_item != NULL; - child_item = opal_tree_get_next_sibling(child_item) ) { - - if( ORTE_SUCCESS != (ret = rmaps_lama_merge_trees(src_tree, - max_tree, - child_item, - max_parent)) ) { - exit_status = ret; - goto cleanup; - } - } - - exit_status = ORTE_SUCCESS; - goto cleanup; - } - /* - * If the max tree conflicts due to cache, then we need to prune the - * max tree until it matches. - * JJH: If we are pruning a level of the hierarchy then make sure we - * JJH: don't need it for the process layout. - */ - else if( LAMA_LEVEL_CACHE_L3 == *key_max || - LAMA_LEVEL_CACHE_L2 == *key_max || - LAMA_LEVEL_CACHE_L1 == *key_max || - LAMA_LEVEL_NUMA == *key_max ) { - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Warning: Merge Trees: " - "Max with Conflicting Memory Hierarchy [Src (%2d - %s) vs Max (%2d - %s)]", - *key_src, key_src_str, *key_max, key_max_str); - - /* - * If we are pruning a cache level, then check to make sure it is - * not important to the process layout. 
- */ - if( !rmaps_lama_ok_to_prune_level(*key_max) ) { - orte_show_help("help-orte-rmaps-lama.txt", - "orte-rmaps-lama:merge-conflict-bad-prune-src", - true, - key_max_str, - (NULL == rmaps_lama_cmd_map ? "[Not Provided]" : rmaps_lama_cmd_map), - (NULL == rmaps_lama_cmd_bind ? "[Not Provided]" : rmaps_lama_cmd_bind), - (NULL == rmaps_lama_cmd_mppr ? "[Not Provided]" : rmaps_lama_cmd_mppr), - (NULL == rmaps_lama_cmd_ordering ? "[Not Provided]" : rmaps_lama_cmd_ordering)); - exit_status = ORTE_ERROR; - goto cleanup; - } - - max_child_item = opal_tree_get_first_child(max_parent); - /* Prune parent */ - opal_tree_remove_item(max_tree, max_parent); - - /* Try again with child */ - exit_status = rmaps_lama_merge_trees(src_tree, - max_tree, - src_parent, - max_child_item); - goto cleanup; - } - - /* - * If we cannot resolve it, give up. - */ - opal_output(0, "mca:rmaps:lama: Error: Merge Trees: " - "Different Keys Src (%2d - %s) vs Max (%2d - %s) - Do not know how to resolve - give up!", - *key_src, key_src_str, *key_max, key_max_str); - - exit_status = ORTE_ERROR; - goto cleanup; - } - - num_max = opal_tree_num_children(max_parent); - num_src = opal_tree_num_children(src_parent); - - /* - * If the 'native' tree has more children than the 'max' tree. - * Add the missing children to the 'max' tree. - */ - if( num_max < num_src ) { - i = 0; - for(child_item = opal_tree_get_first_child(src_parent); - child_item != NULL; - child_item = opal_tree_get_next_sibling(child_item)) { - if(i >= num_max ) { -#if 1 - str = rmaps_lama_max_tree_pretty_print_subtree_element_get(src_tree, child_item, 0); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Merge: Appending child %s", - str); - free(str); -#endif - /* Add child's subtree to max */ - opal_tree_copy_subtree(src_tree, child_item, max_tree, max_parent); - } - ++i; - } - } - - /* - * Recursively search all children of 'native' tree. 
- * - * Note: Only need to add the children to the 'left-most' branch of the - * 'max' tree since that is the only branch that is searched during mapping. - * But do the whole thing for good measure. - */ - for( child_item = opal_tree_get_first_child(src_parent), - max_child_item = opal_tree_get_first_child(max_parent); - child_item != NULL; - child_item = opal_tree_get_next_sibling(child_item), - max_child_item = opal_tree_get_next_sibling(max_child_item) ) { - - if( ORTE_SUCCESS != (ret = rmaps_lama_merge_trees(src_tree, - max_tree, - child_item, - max_child_item)) ) { - exit_status = ret; - goto cleanup; - } - } - - cleanup: - if( NULL != key_src_str ) { - free(key_src_str); - key_src_str = NULL; - } - - if( NULL != key_max_str ) { - free(key_max_str); - key_max_str = NULL; - } - - return exit_status; -} - -static int rmaps_lama_prune_max_tree(opal_tree_t *max_tree, opal_tree_item_t *parent_item) -{ - int ret; - opal_tree_item_t *child_item = NULL, *next_item; - int i; - bool found; - rmaps_lama_level_type_t *key_max; - char *tmp_str = NULL; - - /* - * Basecase - */ - if( NULL == parent_item ) { - return ORTE_SUCCESS; - } - - /* - * Recursively decend tree - Depth first - * Basecase: No children, loop skipped - */ - child_item = opal_tree_get_first_child(parent_item); - while( child_item != NULL ) { - /* Do this before the recursive call, since it might remove this - * child so we need to preserve a pointer to the next sibling. 
- */ - next_item = opal_tree_get_next_sibling(child_item); - - if( ORTE_SUCCESS != (ret = rmaps_lama_prune_max_tree(max_tree, - child_item)) ) { - return ret; - } - - child_item = next_item; - } - - key_max = (rmaps_lama_level_type_t*)max_tree->get_key(parent_item); - - /* - * Check keys against the user supplied layout - */ - found = false; - for(i = 0; i < lama_mapping_num_layouts; ++i ) { - if( 0 == max_tree->comp(parent_item, &lama_mapping_layout[i]) ) { - found = true; - break; - } - } - - if( !found ) { - if( 15 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - tmp_str = lama_type_enum_to_str(*key_max); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Before pruning %s", - tmp_str); - free(tmp_str); - rmaps_lama_max_tree_pretty_print_tree(max_tree); - } - - opal_tree_remove_item(max_tree, parent_item); - - return ORTE_SUCCESS; - } - - return ORTE_SUCCESS; -} - - -hwloc_obj_t * rmaps_lama_find_nth_subtree_match(hwloc_topology_t hwloc_topo, - hwloc_obj_t parent_obj, - int nth, - rmaps_lama_level_type_t lama_key) -{ - hwloc_obj_t *cur_child = NULL; - hwloc_obj_type_t hwloc_key; - int depth; - int num_found; -#if 0 - char str[128]; -#endif - - cur_child = (hwloc_obj_t*)malloc(sizeof(hwloc_obj_t) * 1); - - /* - * Convert LAMA key to HWLOC key - */ - rmaps_lama_convert_lama_key_to_hwloc_key(lama_key, &hwloc_key, &depth); - - /* - * Decend tree looking for the n'th matching subtree - */ - num_found = -1; - rmaps_lama_find_nth_subtree_match_core(hwloc_topo, - parent_obj, - nth, - &num_found, - hwloc_key, - depth, - cur_child); - - /* - * Check to see if we found it - */ -#if 0 - hwloc_obj_snprintf(str, sizeof(str), hwloc_topo, *cur_child, "#", 0); - if( nth == num_found ) { - printf("--> FOUND : %-20s \t -- \t %2d of %2d\n", str, nth, num_found); - } - else { - printf("--> MISSING : %-20s \t -- \t %2d of %2d\n", str, nth, num_found); - } -#endif - - if( nth == num_found ) { - return cur_child; 
- } - else { - free(cur_child); - return NULL; - } -} - -static int rmaps_lama_find_nth_subtree_match_core(hwloc_topology_t hwloc_topo, - hwloc_obj_t parent_obj, - int nth, - int *num_found, - hwloc_obj_type_t hwloc_key, - int depth, - hwloc_obj_t *cur_child) -{ - unsigned i; - bool found = false; - -#if 0 - { - char str[128]; - hwloc_obj_snprintf(str, sizeof(str), hwloc_topo, parent_obj, "#", 0); - printf("--> Checking -- %-20s \t -- \t %2d of %2d\n", str, nth, *num_found); - } -#endif - - /* - * Check if the keys match - */ - if( hwloc_key == parent_obj->type ) { - if( HWLOC_OBJ_CACHE == parent_obj->type && - depth == (int)parent_obj->attr->cache.depth ) { - *num_found += 1; - found = true; - } else { - *num_found += 1; - found = true; - } - } - - /* - * Basecase: - * If we have found the correct item, return - */ - if( nth == *num_found ) { - *cur_child = parent_obj; - return ORTE_SUCCESS; - } - - /* - * Do no go any deeper in the tree than we have to - */ - if( !found ) { - for(i = 0; i < parent_obj->arity; ++i ) { - rmaps_lama_find_nth_subtree_match_core(hwloc_topo, - parent_obj->children[i], - nth, - num_found, - hwloc_key, - depth, - cur_child); - if( nth == *num_found ) { - return ORTE_SUCCESS; - } - } - } - - return ORTE_SUCCESS; -} - -hwloc_obj_t * rmaps_lama_find_parent(hwloc_topology_t hwloc_topo, - hwloc_obj_t *child_obj, - rmaps_lama_level_type_t lama_key) -{ - hwloc_obj_t *cur_parent = NULL; - hwloc_obj_type_t hwloc_key; - int depth; - - /* - * Convert LAMA key to HWLOC key - */ - rmaps_lama_convert_lama_key_to_hwloc_key(lama_key, &hwloc_key, &depth); - - /* - * Sanity check - */ - if( hwloc_key == (*child_obj)->type ) { - if( HWLOC_OBJ_CACHE == (*child_obj)->type && - depth == (int)(*child_obj)->attr->cache.depth ) { - return child_obj; - } else { - return child_obj; - } - } - - cur_parent = (hwloc_obj_t*)malloc(sizeof(hwloc_obj_t) * 1); - if (NULL == cur_parent) { - return NULL; - } - - /* - * Accend tree to find mathing parent - */ - *cur_parent = 
(*child_obj)->parent; - while(NULL != *cur_parent ) { - if( hwloc_key == (*cur_parent)->type ) { - if( HWLOC_OBJ_CACHE == (*cur_parent)->type && - depth == (int)(*cur_parent)->attr->cache.depth ) { - return cur_parent; - } else { - return cur_parent; - } - } - - *cur_parent = (*cur_parent)->parent; - } - - free(cur_parent); - return NULL; -} - - -/********************************* - * Max Tree Structure Functions - *********************************/ -opal_tree_t * rmaps_lama_create_empty_max_tree(void) -{ - opal_tree_t *tmp_tree = NULL; - - tmp_tree = OBJ_NEW(opal_tree_t); - opal_tree_init(tmp_tree, - &lama_max_tree_comp, - &lama_max_tree_serialize, - &lama_max_tree_deserialize, - &lama_max_tree_get_key); - - return tmp_tree; -} - -static int lama_max_tree_comp(opal_tree_item_t *item, void *key) -{ - if( ((rmaps_lama_max_tree_item_t *)item)->type == *((rmaps_lama_level_type_t *)key) ) { - return 0; - } - - return -1; -} - -static int lama_max_tree_serialize(opal_tree_item_t *item, opal_buffer_t *buffer) -{ - opal_dss.pack(buffer, &(((rmaps_lama_max_tree_item_t *)item)->type), 1, OPAL_INT); - - return ORTE_SUCCESS; -} - -static int lama_max_tree_deserialize(opal_buffer_t *buffer, opal_tree_item_t **item) -{ - rmaps_lama_max_tree_item_t *element; - orte_std_cntr_t n = 1; - - element = OBJ_NEW(rmaps_lama_max_tree_item_t); - if( OPAL_SUCCESS == opal_dss.unpack(buffer, &(element->type), &n, OPAL_INT) ) { - *item = (opal_tree_item_t*)element; - } else { - *item = NULL; - } - - return ORTE_SUCCESS; -} - -static void * lama_max_tree_get_key(opal_tree_item_t *item) -{ - return &(((rmaps_lama_max_tree_item_t *)item)->type); -} - - -/********************************* - * Pretty Print Functions - *********************************/ -void rmaps_lama_max_tree_pretty_print_tree(opal_tree_t *tree) -{ - if( NULL == tree ) { - return; - } - - if( opal_tree_is_empty(tree) ) { - return; - } - - pretty_print_subtree(tree, opal_tree_get_root(tree), 0); - - return; -} - -static char * 
rmaps_lama_max_tree_pretty_print_subtree_element_get(opal_tree_t *tree, - opal_tree_item_t *parent, - int level) -{ - char *element_str = NULL; - char *spacer = NULL; - char *label = NULL; - rmaps_lama_level_type_t *type = NULL; - int i; - - if( NULL == parent ) { - return NULL; - } - - spacer = (char *)malloc(sizeof(char) * (level+1)); - for(i = 0; i < level; ++i ) { - spacer[i] = ' '; - } - spacer[level] = '\0'; - - type = (rmaps_lama_level_type_t *)(tree->get_key(parent)); - label = lama_type_enum_to_str(*type); - - asprintf(&element_str, "%s[%s \t : %3d, %3d", - spacer, label, - parent->opal_tree_num_children, parent->opal_tree_num_ancestors); - - free(spacer); - free(label); - - return element_str; -} - -static void pretty_print_subtree(opal_tree_t *tree, opal_tree_item_t *parent, int level) -{ - opal_tree_item_t *child = NULL; - - if( NULL == parent ) { - return; - } - - /* - * Display Self - */ - pretty_print_subtree_element(tree, parent, level); - - /* - * Depth-first display children - * Basecase; If no children - return - */ - level++; - for(child = opal_tree_get_first_child(parent); - child != NULL; - child = opal_tree_get_next_sibling(child) ) { - pretty_print_subtree(tree, child, level); - } - - return; - -} - -static void pretty_print_subtree_element(opal_tree_t *tree, opal_tree_item_t *parent, int level) -{ - char *element_str = NULL; - - if( NULL == parent ) { - return; - } - - element_str = rmaps_lama_max_tree_pretty_print_subtree_element_get(tree, parent, level); - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Tree Element: %s", - element_str); - - free(element_str); - - return; -} diff --git a/orte/mca/rmaps/lama/rmaps_lama_module.c b/orte/mca/rmaps/lama/rmaps_lama_module.c deleted file mode 100644 index ceb97bf25b1..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama_module.c +++ /dev/null @@ -1,1914 +0,0 @@ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. 
- * - * Copyright (c) 2012-2017 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2014 Intel, Inc. All rights reserved - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#include - -#include "opal/mca/hwloc/hwloc-internal.h" - -#include "opal/util/argv.h" -#include "opal/class/opal_tree.h" - -#include "orte/util/show_help.h" -#include "orte/util/error_strings.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" - -#include "orte/runtime/orte_globals.h" - -#include "rmaps_lama.h" - -#include MCA_timer_IMPLEMENTATION_HEADER - - -/********************************* - * Module setup - *********************************/ -static int orte_rmaps_lama_map(orte_job_t *jdata); -orte_rmaps_base_module_t orte_rmaps_lama_module = { - orte_rmaps_lama_map -}; - - -/********************************* - * Timer - *********************************/ -#define RMAPS_LAMA_TIMER_TOTAL 0 -#define RMAPS_LAMA_TIMER_PARSE_PARAMS 1 -#define RMAPS_LAMA_TIMER_BUILD_MAX_TREE 2 -#define RMAPS_LAMA_TIMER_MAPPING 3 -#define RMAPS_LAMA_TIMER_ORDERING 4 -#define RMAPS_LAMA_TIMER_MAX 5 - -static double rmaps_lama_get_time(void); -static void rmaps_lama_set_time(int idx, bool is_start); -static void rmaps_lama_display_all_timers(void); -static void rmaps_lama_clear_timers(void); -static void rmaps_lama_display_indv_timer_core(double diff, char *str); - -static double timer_start[RMAPS_LAMA_TIMER_MAX]; -static double timer_end[RMAPS_LAMA_TIMER_MAX]; -static double timer_accum[RMAPS_LAMA_TIMER_MAX]; - -#define RMAPS_LAMA_CLEAR_TIMERS() \ - { \ - if( rmaps_lama_timing_enabled ) { \ - rmaps_lama_clear_timers(); \ - } \ - } -#define 
RMAPS_LAMA_START_TIMER(idx) \ - { \ - if( rmaps_lama_timing_enabled ) { \ - rmaps_lama_set_time(idx, true); \ - } \ - } -#define RMAPS_LAMA_END_TIMER(idx) \ - { \ - if( rmaps_lama_timing_enabled ) { \ - rmaps_lama_set_time(idx, false); \ - } \ - } -#define RMAPS_LAMA_DISPLAY_TIMERS() \ - { \ - if( rmaps_lama_timing_enabled ) { \ - rmaps_lama_display_all_timers(); \ - } \ - } - - -/********************************* - * Structures & Defines - *********************************/ -static void rmaps_lama_hwloc_user_construct(rmaps_lama_hwloc_user_t *item); -static void rmaps_lama_hwloc_user_destruct(rmaps_lama_hwloc_user_t *item); - -OBJ_CLASS_INSTANCE(rmaps_lama_hwloc_user_t, - opal_object_t, - rmaps_lama_hwloc_user_construct, - rmaps_lama_hwloc_user_destruct); - - -/********************************* - * Globals - *********************************/ -/* - * Mapping - */ -rmaps_lama_level_type_t *lama_mapping_layout = NULL; -static rmaps_lama_level_type_t *lama_mapping_layout_sort = NULL; -int lama_mapping_num_layouts = 0; - -/* - * Binding - */ -rmaps_lama_level_type_t lama_binding_level = LAMA_LEVEL_UNKNOWN; -static int lama_binding_num_levels = 0; - -/* - * MPPR - */ -rmaps_lama_level_info_t *lama_mppr_levels = NULL; -int lama_mppr_num_levels = 0; - -/* - * Ordering - */ -static rmaps_lama_order_type_t lama_ordering = LAMA_ORDER_NATURAL; - -/* - * Homogeneous system optimization - */ -bool lama_mppr_max_tree_homogeneous_system = false; - - -/********************************* - * Support Macros - *********************************/ - - -/********************************* - * Support functions - *********************************/ -/* - * Preprocess the command line arguments - */ -static int orte_rmaps_lama_process_params(orte_job_t *jdata); - -/* - * Mapping Support: - * Core mapping function - */ -static int orte_rmaps_lama_map_core(orte_job_t *jdata); - -/* - * Mapping Support: - * Recursive function for mapping process - */ -static int 
rmaps_lama_map_core_iter_level(orte_job_t *jdata, - orte_app_context_t *cur_app_context, - opal_list_t *node_list, - orte_node_t **cur_mach_ptr, - opal_tree_t *max_tree, - int cur_level, - int mach_level, - int **pu_idx_ref, - int **last_pu_idx_ref, - int *num_mapped, - int max_procs, - int *iter_passes); - -/* - * Mapping Support: - * Access the next machine in the node list - */ -static orte_node_t* get_next_machine(orte_job_t *jdata, opal_list_t *node_list, - opal_list_item_t *cur_mach); - -/* - * Mapping Support: - * Check the availability of the requested slot on the specified node - */ -static int check_node_availability(orte_node_t *cur_node, - opal_tree_t *max_tree, - int *pu_idx_ref, - char **slot_list); - -/* - * Mapping Support: - * Debugging PU display - */ -static void display_pu_ref(int *ref, int size, int rank, orte_proc_t *proc); -static char * pu_ref_to_str(int *ref, int size); - -/* - * Mapping Support: - * Convert the process layout 'layer' to the sorted position for the PU - */ -static int convert_layer_to_sort_idx(rmaps_lama_level_type_t layer); - -/* - * MPPR Support: - * Check to make sure a process can be placed on this resource given the - * MPPR restrictions. - */ -static int rmaps_lama_check_mppr(orte_node_t *node, - hwloc_obj_t *child_obj); -static int rmaps_lama_iter_mppr_parents(orte_node_t *node, - hwloc_obj_t *child_obj, - bool check_only); -static int rmaps_lama_iter_mppr_children(orte_node_t *node, - hwloc_obj_t *child_obj, - bool check_only); - -/* - * MPPR Support: - * Increment parents of this child to account for a process being placed - * on this resource. 
- */ -static int rmaps_lama_inc_mppr(orte_node_t *node, - hwloc_obj_t *child_obj); - -/* - * Mapping Support: - * Return the native representation of the slot list - */ -static char * get_native_slot_list(orte_node_t *cur_node, - hwloc_obj_t *pu_obj, - int *put_idx_ref); - -/* - * Ordering Support: - * Reorder sequentially - */ -static int rmaps_lama_ordering_sequential(orte_job_t *jdata); - -/* - * Map a single process to a specific node - */ -static int orte_rmaps_lama_map_process(orte_job_t *jdata, - orte_node_t *node, - int app_idx, - orte_proc_t **proc); - -/********************************* - * Main Module function to map a job - *********************************/ -static int orte_rmaps_lama_map(orte_job_t *jdata) -{ - int ret, exit_status = ORTE_SUCCESS; - mca_base_component_t *loc_comp = &mca_rmaps_lama_component.base_version; - - RMAPS_LAMA_CLEAR_TIMERS(); - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_TOTAL); - - /* - * Sanity Check: - * If we are not the 'chosen' mapper, then exit here - */ - if (NULL != jdata->map->req_mapper && - 0 != strcasecmp(jdata->map->req_mapper, loc_comp->mca_component_name)) { - /* a mapper has been specified, and it isn't me */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: job %s not using lama mapper (using %s)", - ORTE_JOBID_PRINT(jdata->jobid), - jdata->map->req_mapper); - return ORTE_ERR_TAKE_NEXT_OPTION; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping job %s", - ORTE_JOBID_PRINT(jdata->jobid)); - - /* - * Identify this as the mapper responsible for this job - */ - if (NULL != jdata->map->last_mapper) { - free(jdata->map->last_mapper); - } - jdata->map->last_mapper = strdup(loc_comp->mca_component_name); - - /* - * Start at the beginning... 
- */ - jdata->num_procs = 0; - - /* - * Process the command line arguments - */ - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_PARSE_PARAMS); - if( ORTE_SUCCESS != (ret = orte_rmaps_lama_process_params(jdata)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_PARSE_PARAMS); - - /* - * Actually map the job - */ - if( ORTE_SUCCESS != (ret = orte_rmaps_lama_map_core(jdata)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * All Done - */ - - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_TOTAL); - RMAPS_LAMA_DISPLAY_TIMERS(); - - - cleanup: - if( NULL != lama_mapping_layout ) { - free(lama_mapping_layout); - lama_mapping_layout = NULL; - } - - if( NULL != lama_mapping_layout_sort ) { - free(lama_mapping_layout_sort); - lama_mapping_layout_sort = NULL; - } - - if( NULL != lama_mppr_levels ) { - free(lama_mppr_levels); - lama_mppr_levels = NULL; - } - - return exit_status; -} - - -/********************************* - * User defined lookup structure for hwloc topology - *********************************/ -static void rmaps_lama_hwloc_user_construct(rmaps_lama_hwloc_user_t *item) -{ - item->node_mppr = OBJ_NEW(opal_pointer_array_t); - opal_pointer_array_init(item->node_mppr, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE); -} - -static void rmaps_lama_hwloc_user_destruct(rmaps_lama_hwloc_user_t *item) -{ - orte_std_cntr_t i; - - if( NULL != item->node_mppr ) { - for(i = 0; i < item->node_mppr->size; ++i) { - if( NULL != item->node_mppr->addr[i] ) { - OBJ_RELEASE(item->node_mppr->addr[i]); - item->node_mppr->addr[i] = NULL; - } - } - OBJ_RELEASE(item->node_mppr); - item->node_mppr = NULL; - } -} - - -/********************************* - * Command line parameter parsing functions - *********************************/ -static int orte_rmaps_lama_process_params(orte_job_t *jdata) -{ - int ret, i; - char *type_str = NULL; - - /* - * Process map/bind/order/mppr 
aliases. It will print its own - * error message if something went wrong. - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_process_alias_params(jdata) ) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * Parse: Binding. It will print its own error message if - * something goes wrong. - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Binding : [%s]", - rmaps_lama_cmd_bind); - if( ORTE_SUCCESS != (ret = rmaps_lama_parse_binding(rmaps_lama_cmd_bind, - &lama_binding_level, - &lama_binding_num_levels)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - type_str = lama_type_enum_to_str(lama_binding_level); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Binding : %*d x %10s", - MAX_BIND_DIGIT_LEN, lama_binding_num_levels, type_str); - free(type_str); - type_str = NULL; - } - /* Reset the binding option since we are going to do it ourselves */ - OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); - - /* - * Parse: Mapping from Process Layout string. It will print its - * own error message if something goes wrong. 
- */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Mapping : [%s]", - rmaps_lama_cmd_map); - if( ORTE_SUCCESS != (ret = rmaps_lama_parse_mapping(rmaps_lama_cmd_map, - &lama_mapping_layout, - &lama_mapping_layout_sort, - &lama_mapping_num_layouts)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - type_str = lama_type_enum_to_str(lama_mapping_layout[i]); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Mapping : (%d) %10s (%d vs %d)", - i, type_str, - lama_mapping_layout[i], lama_mapping_layout_sort[i]); - free(type_str); - type_str = NULL; - } - } - - /* - * Parse: MPPR. It will print its own error message if something - * goes wrong. - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- MPPR : [%s]", - rmaps_lama_cmd_mppr); - if( ORTE_SUCCESS != (ret = rmaps_lama_parse_mppr(rmaps_lama_cmd_mppr, - &lama_mppr_levels, - &lama_mppr_num_levels)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - for( i = 0; i < lama_mppr_num_levels; ++i ) { - type_str = lama_type_enum_to_str(lama_mppr_levels[i].type); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- MPPR : %*d at %10s", - MAX_BIND_DIGIT_LEN, lama_mppr_levels[i].max_resources, type_str); - free(type_str); - type_str = NULL; - } - } - - /* - * Parse: Ordering - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: 
---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Ordering : [%s]", - rmaps_lama_cmd_ordering); - if( ORTE_SUCCESS != (ret = rmaps_lama_parse_ordering(rmaps_lama_cmd_ordering, - &lama_ordering)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - if( LAMA_ORDER_NATURAL == lama_ordering ) { - type_str = strdup("Natural"); - } - else if( LAMA_ORDER_SEQ == lama_ordering ) { - type_str = strdup("Sequential"); - } - else { - type_str = strdup("Unknown"); - } - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Ordering : %10s", - type_str); - free(type_str); - type_str = NULL; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - return ORTE_SUCCESS; -} - - -/********************************* - * Support functions - *********************************/ -rmaps_lama_level_type_t lama_type_str_to_enum(char *param) -{ - if( 0 == strncmp(param, "n", strlen("n")) ) { - return LAMA_LEVEL_MACHINE; - } - else if( 0 == strncmp(param, "b", strlen("b")) ) { - return LAMA_LEVEL_BOARD; - } - else if( 0 == strncmp(param, "s", strlen("s")) ) { - return LAMA_LEVEL_SOCKET; - } - else if( 0 == strncmp(param, "c", strlen("c")) ) { - return LAMA_LEVEL_CORE; - } - else if( 0 == strncmp(param, "h", strlen("h")) ) { - return LAMA_LEVEL_PU; - } - else if( 0 == strncmp(param, "L1", strlen("L1")) ) { - return LAMA_LEVEL_CACHE_L1; - } - else if( 0 == strncmp(param, "L2", strlen("L2")) ) { - return LAMA_LEVEL_CACHE_L2; - } - else if( 0 == strncmp(param, "L3", strlen("L3")) ) { - return LAMA_LEVEL_CACHE_L3; - } - else if( 0 == strncmp(param, "N", strlen("N")) ) { - return LAMA_LEVEL_NUMA; - } - - return LAMA_LEVEL_UNKNOWN; -} - -char * lama_type_enum_to_str(rmaps_lama_level_type_t param) -{ - if( LAMA_LEVEL_MACHINE == 
param ) { - return strdup("Machine"); - } - else if( LAMA_LEVEL_BOARD == param ) { - return strdup("Board"); - } - else if( LAMA_LEVEL_SOCKET == param ) { - return strdup("Socket"); - } - else if( LAMA_LEVEL_CORE == param ) { - return strdup("Core"); - } - else if( LAMA_LEVEL_PU == param ) { - return strdup("Hw. Thread"); - } - else if( LAMA_LEVEL_CACHE_L1 == param ) { - return strdup("L1 Cache"); - } - else if( LAMA_LEVEL_CACHE_L2 == param ) { - return strdup("L2 Cache"); - } - else if( LAMA_LEVEL_CACHE_L3 == param ) { - return strdup("L3 Cache"); - } - else if( LAMA_LEVEL_NUMA == param ) { - return strdup("NUMA"); - } - - return strdup("Unknown"); -} - -/********************************* - * Core Mapper function - *********************************/ -static int orte_rmaps_lama_map_core(orte_job_t *jdata) -{ - int ret, exit_status = ORTE_SUCCESS; - int cur_app_idx = 0; - int num_slots; - orte_app_context_t *cur_app_context = NULL; - orte_node_t *cur_mach = NULL; - orte_node_t **cur_mach_ptr = NULL; - orte_proc_t *proc = NULL; - opal_list_t *node_list = NULL; - opal_list_item_t *item = NULL; - opal_tree_t *max_tree = NULL; - int *pu_idx_ref = NULL; - int *last_pu_idx_ref = NULL; - int i, num_mapped, last_num_mapped, mach_level = -1; - orte_std_cntr_t j; - int max_procs_to_map; - int iter_passes; - char * last_level_str = NULL; - bool initial_map = true; - - /* - * Setup PU reference - * Find the position of the 'machine' - */ - pu_idx_ref = (int*)malloc(sizeof(int) * lama_mapping_num_layouts); - if (NULL == pu_idx_ref) { - return ORTE_ERROR; - } - last_pu_idx_ref = (int*)malloc(sizeof(int) * lama_mapping_num_layouts); - if (NULL == last_pu_idx_ref) { - free(pu_idx_ref); - return ORTE_ERROR; - } - - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - pu_idx_ref[i] = 0; - last_pu_idx_ref[i] = -1; - if( LAMA_LEVEL_MACHINE == lama_mapping_layout[i] ) { - mach_level = i; - } - } - - /* - * Foreach app context - */ - for(cur_app_idx = 0; cur_app_idx < jdata->apps->size; 
++cur_app_idx ) { - if( NULL == (cur_app_context = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, cur_app_idx))) { - continue; - } - - /* - * Get the list of nodes for this app_context. - */ - node_list = OBJ_NEW(opal_list_t); - ret = orte_rmaps_base_get_target_nodes(node_list, - &num_slots, - cur_app_context, - jdata->map->mapping, - initial_map, false); - if(ORTE_SUCCESS != ret ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - /* Flag that all subsequent requests should not reset the node->mapped flag */ - initial_map = false; - - /* - * If a bookmark exists from some prior mapping, then start from there - */ - cur_mach = (orte_node_t*)orte_rmaps_base_get_starting_point(node_list, jdata); - - /* - * If the application did not specify the number of procs - * then set it to the number of 'slots' - * JJH: TODO: Revisit 'max_procs' calculation - */ - if (0 == cur_app_context->num_procs) { - cur_app_context->num_procs = num_slots; - } - max_procs_to_map = cur_app_context->num_procs; - - /* - * Build the Max Tree - */ - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_BUILD_MAX_TREE); - max_tree = rmaps_lama_create_empty_max_tree(); - if( ORTE_SUCCESS != (ret = rmaps_lama_build_max_tree(jdata, node_list, - max_tree, - &lama_mppr_max_tree_homogeneous_system)) ) { - exit_status = ret; - goto cleanup; - } - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_BUILD_MAX_TREE); - - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: -----------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_MAPPING); - - /* - * Clear PU reference - */ - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - pu_idx_ref[i] = 0; - } - - /* - * Mapping: Recursively loop over all levels - */ - num_mapped = 0; - last_num_mapped = 0; - iter_passes = 0; - cur_mach_ptr = (orte_node_t**)malloc(sizeof(orte_node_t*)); 
- *cur_mach_ptr = cur_mach; - while( max_procs_to_map > num_mapped ) { - ret = rmaps_lama_map_core_iter_level(jdata, - cur_app_context, - node_list, - cur_mach_ptr, - max_tree, - lama_mapping_num_layouts-1, - mach_level, - &pu_idx_ref, - &last_pu_idx_ref, - &num_mapped, - max_procs_to_map, - &iter_passes); - if( ORTE_SUCCESS != ret ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * We only get here (without finishing the mapping) if we are going to - * start oversubscribing resources. - */ - if( max_procs_to_map > num_mapped ) { - if( !rmaps_lama_can_oversubscribe ) { - orte_show_help("help-orte-rmaps-lama.txt", - "orte-rmaps-lama:oversubscribe", - true, - num_mapped, max_procs_to_map); - exit_status = ORTE_ERROR; - goto cleanup; - } else { - rmaps_lama_am_oversubscribing = true; - } - } - - /* - * Check to see if we have made any progress in the mapping loop - */ - if( 0 < cur_app_idx && 2 == iter_passes ) { - /* - * Give it another pass: - * This is an edge case when we are trying to restart from a - * bookmark left by a previous app context. If this app context - * is starting from exactly the beginning of the allocation - * then the recursive loop could return out here after the - * increment pass. This is indicated by (iter_passes = 2). - * Since no processes were mapped, we just try again. - */ - } - else if( last_num_mapped == num_mapped ) { - orte_show_help("help-orte-rmaps-lama.txt", - "orte-rmaps-lama:no-resources-available", - true, - cur_app_idx, - num_mapped, max_procs_to_map, - (NULL == rmaps_lama_cmd_map ? "[Not Provided]" : rmaps_lama_cmd_map), - (NULL == rmaps_lama_cmd_bind ? "[Not Provided]" : rmaps_lama_cmd_bind), - (NULL == rmaps_lama_cmd_mppr ? "[Not Provided]" : rmaps_lama_cmd_mppr), - (NULL == rmaps_lama_cmd_ordering ? 
"[Not Provided]" : rmaps_lama_cmd_ordering)); - exit_status = ORTE_ERROR; - goto cleanup; - } else { - last_num_mapped = num_mapped; - } - } - - /* - * Display Bookmark for debugging - */ - last_level_str = pu_ref_to_str(last_pu_idx_ref, lama_mapping_num_layouts); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Bookmark: --> Node %10s PU %10s", - jdata->bookmark->name, last_level_str); - free(last_level_str); - last_level_str = NULL; - - /* - * Clenup for next iteration - */ - if( NULL != node_list ) { - while(NULL != (item = opal_list_remove_first(node_list))) { - OBJ_RELEASE(item); - } - OBJ_RELEASE(node_list); - node_list = NULL; - } - - OBJ_RELEASE(max_tree); - max_tree = NULL; - } - - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_MAPPING); - - - /* - * Ordering - */ - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_ORDERING); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - if( LAMA_ORDER_SEQ == lama_ordering ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Sequential ------------"); - - if( ORTE_SUCCESS != (ret = rmaps_lama_ordering_sequential(jdata)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - else { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Natural ---------------"); -#if 0 - /* - * We compute our own vpids inline with the algorithm. So no need to use the - * orte_rmaps_base_compute_vpids() function. 
- */ -#endif - } - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_ORDERING); - - - /* - * Display Mapping - */ - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - char *cpu_bitmap; - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - for( j = 0; j < jdata->procs->size; ++j) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { - continue; - } - cpu_bitmap = NULL; - orte_get_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, (void**)&cpu_bitmap, OPAL_STRING); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Proc. %2d on Node %10s - Slot %s", - proc->name.vpid, proc->node->name, cpu_bitmap); - if (NULL != cpu_bitmap) { - free(cpu_bitmap); - } - } - } - - - /* - * All done - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Finished ------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - - cleanup: - if( NULL != node_list ) { - while(NULL != (item = opal_list_remove_first(node_list))) { - OBJ_RELEASE(item); - } - OBJ_RELEASE(node_list); - } - - if( NULL != max_tree ) { - OBJ_RELEASE(max_tree); - } - - free(pu_idx_ref); - free(last_pu_idx_ref); - - if( NULL != last_level_str ) { - free(last_level_str); - } - - return exit_status; -} - -static int rmaps_lama_map_core_iter_level(orte_job_t *jdata, - orte_app_context_t *cur_app_context, - opal_list_t *node_list, - orte_node_t **cur_mach_ptr, - opal_tree_t *max_tree, - int cur_level, - int mach_level, - int **pu_idx_ref, - int **last_pu_idx_ref, - int *num_mapped, - int max_procs, - int *iter_passes) -{ - int ret, exit_status = ORTE_SUCCESS; - int i, j; - opal_tree_item_t *tree_for_level = NULL; - int max_subtree_arity = 0; - char * level_str = NULL; - char * last_level_str = NULL; - char * slot_list = 
NULL; - orte_proc_t *proc = NULL; - int pu_idx = 0; - - /* - * Find the current tree for this level - * If it is the machine level, then we need to access the information from - * the node list, not the max_tree. - */ - if( cur_level != mach_level ) { - tree_for_level = opal_tree_find_with(opal_tree_get_root(max_tree), - &lama_mapping_layout[cur_level]); - /* - * We do not need subtree, but the arity of the subtree - * JJH TODO: This should be an opal_tree function. - */ - max_subtree_arity = 1; /* include self */ - while( NULL != (tree_for_level = opal_tree_get_next_sibling(tree_for_level)) ) { - ++max_subtree_arity; - } - } - else if( NULL == *cur_mach_ptr ) { - *cur_mach_ptr = get_next_machine(jdata, node_list, (opal_list_item_t*)(*cur_mach_ptr)); - } - - pu_idx = convert_layer_to_sort_idx(lama_mapping_layout[cur_level]); - level_str = lama_type_enum_to_str(lama_mapping_layout[cur_level]); - - /* - * Do we need to advance to a bookmark - */ - if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) { - /* - * Display last mapped - */ - last_level_str = pu_ref_to_str(*last_pu_idx_ref, lama_mapping_num_layouts); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Bookmark: --> Last Mapped: Node %10s (bkmrk %10s) PU %10s - Level %2d", - (NULL == *cur_mach_ptr ? 
"(NULL)" : (*cur_mach_ptr)->name), - jdata->bookmark->name, last_level_str, (*last_pu_idx_ref)[pu_idx]); - free(last_level_str); - last_level_str = NULL; - - /* - * Set the level starting point to the last known index - */ - i = (*last_pu_idx_ref)[pu_idx]; - } else { - i = 0; - } - - - /* - * Loop over all siblings at this level - * Initial condition above, Increment at bottom, Break check at bottom - */ - while( 1 ) { - /* - * Define the PU index - */ - (*pu_idx_ref)[pu_idx] = i; - - if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: --> Level %2d: %10s (%2d) - I %2d - Arity %2d - %10s - Increment only", - cur_level+1, - level_str, pu_idx, i, max_subtree_arity, - (NULL == *cur_mach_ptr ? "" : (*cur_mach_ptr)->name)); - } else { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: --> Level %2d: %10s (%2d) - I %2d - Arity %2d - %10s", - cur_level+1, - level_str, pu_idx, i, max_subtree_arity, - (NULL == *cur_mach_ptr ? "" : (*cur_mach_ptr)->name)); - } - - - /* - * If not the inner most loop, iterate to the next level down - */ - if( cur_level > 0 ) { - ret = rmaps_lama_map_core_iter_level(jdata, - cur_app_context, - node_list, - cur_mach_ptr, - max_tree, - cur_level - 1, - mach_level, - pu_idx_ref, - last_pu_idx_ref, - num_mapped, - max_procs, - iter_passes); - if( ORTE_SUCCESS != ret ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - /* - * If we are restarting the iteration from a previous bookmark then - * the first pass through is a no-op mapping pass that just increments - * the PU reference. - * Called by innermost loop - */ - else if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) { - *iter_passes += 1; - } - /* - * Try to map at this location - */ - else { - /* - * On first pass, make sure we increment this, just so we do not - * accidentally think this is an increment pass. 
- */ - if( 0 == *iter_passes ) { - *iter_passes += 1; - } - - /* - * Display the PU ref for debugging - */ - display_pu_ref(*pu_idx_ref, lama_mapping_num_layouts, *num_mapped, proc); - - - /* - * Check to see if this resource is available on this node. - * - * In a heterogeneous or otherwise non-uniformly restricted - * environment we may iterate to a resource that is not - * available either because it does not exist, or is not - * available for allocation (off-lined, sub-node allocation). - * Additionally, we need to check resource constrains expressed - * in the MPPR and binding. - */ - ret = check_node_availability((*cur_mach_ptr), - max_tree, - *pu_idx_ref, - &slot_list); - if( ORTE_SUCCESS != ret || NULL == slot_list ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:hwtopo: Mapping: --> Level %2d: %s - INVALID/SKIP", - cur_level+1, - level_str); - /* - * By not mapping here we just let the iterations continue - * until a suitable match is found or we have exhausted all - * possible locations to match and thus cannot map any more. - */ - } - else { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: --> Level %2d: %s - Slot List (%s)", - cur_level+1, - level_str, slot_list); - - /* - * Map this process onto the resource specified - * level_tree_objs[*] and cur_mach point to the specific resource - */ - proc = NULL; - ret = orte_rmaps_lama_map_process(jdata, - (*cur_mach_ptr), - cur_app_context->idx, - &proc); - if( ORTE_SUCCESS != ret ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto bailout; - } - - /* - * Set the binding for this process - */ - orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, slot_list, OPAL_STRING); - /* - * Insert the proc into the 'native' ordering location. 
- */ - proc->name.vpid = jdata->num_procs; - if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(jdata->procs, - proc->name.vpid, proc))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - jdata->num_procs += 1; - - /* - * Save a bookmark so we can return here later if necessary - */ - for( j = 0; j < lama_mapping_num_layouts; ++j ) { - (*last_pu_idx_ref)[j] = (*pu_idx_ref)[j]; - } - jdata->bookmark = (orte_node_t*)(*cur_mach_ptr); - - (*num_mapped)++; - } - } - - /* - * Increment loop - * - * If we are binding, then we may need to advance the binding layer - * by more than one. - */ - if( cur_level != mach_level ) { - if( lama_binding_level == lama_mapping_layout[cur_level] ) { - i += lama_binding_num_levels; - } else { - ++i; - } - } else { - /* - * Note: Currently we do not allow for 'binding' to multiple machines - * But keep the code just in case we want to play with 'stride' later - */ - if( lama_binding_level == lama_mapping_layout[cur_level] && lama_binding_num_levels > 1) { - opal_output(0, "mca:rmaps:lama: ERROR: Cannot bind to multiple machines - SHOULD NEVER HAPPEN: %s", - rmaps_lama_cmd_bind); - exit_status = ORTE_ERROR; - goto bailout; -#if 0 - for( j = 0; j < lama_binding_num_levels; ++j ) { - cur_mach = get_next_machine(jdata, node_list, (opal_list_item_t*)cur_mach); - if( NULL == cur_mach ) { - break; - } - ++i; - } -#endif - } else { - *cur_mach_ptr = get_next_machine(jdata, node_list, (opal_list_item_t*)(*cur_mach_ptr)); - ++i; - } - } - - /* - * Check if we are done mapping before iterating again - */ - if( max_procs <= *num_mapped ) { - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - /* - * Check if we are done looping - */ - if( cur_level != mach_level ) { - if( i >= max_subtree_arity ) { - break; - } - } else { - if( NULL == *cur_mach_ptr ) { - break; - } - } - } - - - /* - * Sanity Check: Check if we are done mapping - */ - if( max_procs <= *num_mapped ) { - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - cleanup: 
- /* - * If the outermost layer, the increment the number of iteration passes. - */ - if( cur_level == lama_mapping_num_layouts-1 ) { - *iter_passes += 1; - } - - bailout: - if( NULL != level_str ) { - free(level_str); - level_str = NULL; - } - - if( NULL != slot_list ) { - free(slot_list); - slot_list = NULL; - } - - return exit_status; -} - -static orte_node_t* get_next_machine(orte_job_t *jdata, opal_list_t *node_list, - opal_list_item_t *cur_mach) -{ - orte_node_t *next_mach = NULL; - - if( NULL == cur_mach ) { - next_mach = (orte_node_t*)opal_list_get_first(node_list); - } - else if( opal_list_get_last(node_list) == cur_mach ) { - next_mach = NULL; - } - else { - next_mach = (orte_node_t*)opal_list_get_next(cur_mach); - } - - return next_mach; -} - -static int orte_rmaps_lama_map_process(orte_job_t *jdata, - orte_node_t *node, - int app_idx, - orte_proc_t **proc) -{ - int ret; - - /* - * Add this node to the map, but only once - */ - if( !ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED) ) { - if (ORTE_SUCCESS > (ret = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { - ORTE_ERROR_LOG(ret); - return ret; - } - ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); - OBJ_RETAIN(node); /* maintain accounting on object */ - ++(jdata->map->num_nodes); - } - - /* - * Setup the process object - */ - if (NULL == (*proc = orte_rmaps_base_setup_proc(jdata, node, app_idx))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_ordering_sequential(orte_job_t *jdata) -{ - orte_job_map_t *map; - orte_proc_t *proc = NULL, *swap = NULL; - orte_std_cntr_t i, j; - int cur_rank = 0; - orte_node_t *cur_node = NULL; - - map = jdata->map; - - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - /* - * Assign the ranks sequentially - */ - for( i = 0; i < map->nodes->size; ++i) { - if (NULL == (cur_node = 
(orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { - continue; - } - for( j = 0; j < cur_node->procs->size; ++j) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(cur_node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - continue; - } - - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Rename Proc. %2d to %2d (Rev. %s)", - proc->name.vpid, cur_rank, proc->node->name); - proc->name.vpid = cur_rank; - ++cur_rank; - } - } - - /* - * Fix the job structure ordering - Sort by new vpid - * - * If we do not do this then the remote daemons assign the incorrect - * ranks to the processes since they use the relative ordering in the - * jdata->procs structure to determine vpids locally. - * - * JJH: Look at combining these loops with the loop in the core so we - * JJH: do not have to iterate over the list two times - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - cur_rank = 0; - for( j = 0; j < jdata->procs->size; ++j) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { - continue; - } - - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Proc. %2d on Node %s", - proc->name.vpid, proc->node->name); - - while((int)proc->name.vpid != cur_rank ) { - swap = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid); - - opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); - opal_pointer_array_set_item(jdata->procs, cur_rank, swap); - - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: \t SWAP Proc. %2d (%d) and Proc. 
%2d (%d)", - proc->name.vpid, cur_rank, swap->name.vpid, proc->name.vpid); - proc = swap; - } - ++cur_rank; - } - - return ORTE_SUCCESS; -} - -static int convert_layer_to_sort_idx(rmaps_lama_level_type_t layer) -{ - int i; - - for(i = 0; i < lama_mapping_num_layouts; ++i ) { - if( lama_mapping_layout_sort[i] == layer ) { - return i; - } - } - - return 0; -} - -static void display_pu_ref(int *ref, int size, int rank, orte_proc_t *proc) -{ - char *str = NULL; - - str = pu_ref_to_str(ref, size); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: PU Ref: %s [Rank %2d] Name: %s", - str, rank, - (NULL == proc ? "(null)" : ORTE_NAME_PRINT(&proc->name))); - - free(str); - - return; -} - -static char * pu_ref_to_str(int *ref, int size) -{ - int i, idx; - char *str = NULL; - - str = (char *)malloc(sizeof(char) * (2 * size)); - for(i = 0, idx = 0; i < size; ++i, idx += 2) { - sprintf(&(str[idx]), "%2d", ref[i]); - } - - return str; -} - -static int check_node_availability(orte_node_t *cur_node, - opal_tree_t *max_tree, - int *pu_idx_ref, - char **slot_list) -{ - int exit_status = ORTE_SUCCESS; - int i; - char * level_str = NULL; - hwloc_obj_t *topo_child = NULL, *topo_parent, *topo_allocated; - - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Checking: Node (%s) -------------", - cur_node->name); - opal_output_verbose(11, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - - /* - * Determine if the current node has the necessary hardware - * as described by the PU index. - * Find the hwloc object reference for the resource pointed to - * by the PU index. - * JJH TODO: If homogeneous system then this could be simplified. 
- */ - topo_allocated = topo_parent = (hwloc_obj_t*)malloc(sizeof(hwloc_obj_t) * 1); - if (NULL == topo_parent) { - return ORTE_ERROR; - } - *topo_parent = hwloc_get_obj_by_depth(cur_node->topology, 0, 0); - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - /* - * Skip 'machine' level - */ - if( LAMA_LEVEL_MACHINE == lama_mapping_layout_sort[i] ) { - continue; - } - /* - * Skip 'board' level - * JJH: HWLOC does not support BOARD at the moment - */ - if( LAMA_LEVEL_BOARD == lama_mapping_layout_sort[i] ) { - continue; - } - - level_str = lama_type_enum_to_str(lama_mapping_layout_sort[i]); - opal_output_verbose(11, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Checking: %2d of %s", - pu_idx_ref[i], level_str); - - /* - * Find the nth subtree matching the current key - */ - topo_child = rmaps_lama_find_nth_subtree_match(cur_node->topology, - *topo_parent, - pu_idx_ref[i], - lama_mapping_layout_sort[i]); - - /* - * If it does not exist, then this node is not capable of matching - * so it is unavailable. 
- */ - if( NULL == topo_child ) { - opal_output_verbose(11, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Check failed: Node %s does not have a %10s %2d", - cur_node->name, level_str, pu_idx_ref[i]); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * Keep decending the tree - */ - topo_parent = topo_child; - free(level_str); - level_str = NULL; - } - - /* - * We have sufficient hardware :) - */ - - - /* - * Return the native slot list to bind to - * Internally checks the MPPR - */ - *slot_list = get_native_slot_list(cur_node, topo_parent, pu_idx_ref); - if( NULL == *slot_list ) { - goto cleanup; - } - - cleanup: - if( NULL != level_str ) { - free(level_str); - level_str = NULL; - } - - if( ORTE_SUCCESS != exit_status ) { - if( NULL != *slot_list ) { - free(*slot_list); - *slot_list = NULL; - } - } - - free(topo_allocated); - - return exit_status; -} - -static int rmaps_lama_check_mppr(orte_node_t *node, - hwloc_obj_t *child_obj) -{ - int ret; - - /* - * Optimization if no MPPR provided - */ - if( NULL == lama_mppr_levels ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: No MPPR to check - Skip..."); - return ORTE_SUCCESS; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Check ---------------------------"); - /* - * Check Parents (excluding self) - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_parents(node, &(*child_obj)->parent, true)) ) { - return ret; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Check ---------------------------"); - - /* - * Check Children (including self) - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_children(node, child_obj, true)) ) { - return ret; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Check ---------------------------"); - - return ORTE_SUCCESS; -} - -static int rmaps_lama_inc_mppr(orte_node_t *node, - 
hwloc_obj_t *child_obj) -{ - int ret; - - /* - * Optimization if no MPPR provided - */ - if( NULL == lama_mppr_levels ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: No MPPR to increment - Skip..."); - return ORTE_SUCCESS; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Inc ---------------------------"); - /* - * Increment Parents (excluding self) - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_parents(node, &(*child_obj)->parent, false)) ) { - return ret; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Inc ---------------------------"); - - /* - * Increment Children (including self) - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_children(node, child_obj, false)) ) { - return ret; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Inc ---------------------------"); - - return ORTE_SUCCESS; -} - -static int rmaps_lama_iter_mppr_parents(orte_node_t *node, - hwloc_obj_t *child_obj, - bool check_only) -{ - rmaps_lama_hwloc_user_t *hwloc_userdata = NULL; - rmaps_lama_node_mppr_t *mppr_accounting = NULL; - char str[128]; - - /* - * Basecase - */ - if( NULL == *child_obj ) { - return ORTE_SUCCESS; - } - - /* - * Check self - */ - /* - * Access MPPR info for this object - */ - hwloc_userdata = (rmaps_lama_hwloc_user_t*)((opal_hwloc_topo_data_t*)(*child_obj)->userdata)->userdata; - mppr_accounting = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, node->index); - - hwloc_obj_snprintf(str, sizeof(str), node->topology, *child_obj, "#", 0); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: %s: P [%2d] %10s - %20s - Max %3d , Cur %3d (Oversub.: %s / %s)", - (check_only ? "Checking " : "Increment"), - node->index, node->name, str, - mppr_accounting->max, - (check_only ? 
mppr_accounting->cur : mppr_accounting->cur + 1), - (rmaps_lama_am_oversubscribing ? "T" : "F"), - (rmaps_lama_can_oversubscribe ? "T" : "F") ); - - /* - * Check limits - Error on first to exceed - */ - if( check_only ) { - if( mppr_accounting->max >= 0 && !rmaps_lama_am_oversubscribing) { - if( (mppr_accounting->cur)+1 > mppr_accounting->max ) { - return ORTE_ERROR; - } - } - } - /* - * Increment current number allocated below this level - */ - else { - mppr_accounting->cur += 1; - } - - /* - * Go to parent - */ - return rmaps_lama_iter_mppr_parents(node, &((*child_obj)->parent), check_only); -} - -static int rmaps_lama_iter_mppr_children(orte_node_t *node, - hwloc_obj_t *child_obj, - bool check_only) -{ - int ret; - rmaps_lama_hwloc_user_t *hwloc_userdata = NULL; - rmaps_lama_node_mppr_t *mppr_accounting = NULL; - char str[128]; - int i; - - /* - * Check self - */ - /* - * Access MPPR info for this object - */ - hwloc_userdata = (rmaps_lama_hwloc_user_t*)((opal_hwloc_topo_data_t*)(*child_obj)->userdata)->userdata; - mppr_accounting = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, node->index); - - hwloc_obj_snprintf(str, sizeof(str), node->topology, *child_obj, "#", 0); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: %s: C [%2d] %10s - %20s - Max %3d , Cur %3d (Oversub.: %s / %s)", - (check_only ? "Checking " : "Increment"), - node->index, node->name, str, - mppr_accounting->max, - (check_only ? mppr_accounting->cur : mppr_accounting->cur + 1), - (rmaps_lama_am_oversubscribing ? "T" : "F"), - (rmaps_lama_can_oversubscribe ? 
"T" : "F") ); - - /* - * Check limits - Error on first to exceed - */ - if( check_only ) { - if( mppr_accounting->max >= 0 && !rmaps_lama_am_oversubscribing) { - if( (mppr_accounting->cur)+1 > mppr_accounting->max ) { - return ORTE_ERROR; - } - } - } - /* - * Increment current number allocated below this level - */ - else { - mppr_accounting->cur += 1; - } - - /* - * Check all children - */ - for(i = 0; i < (int)(*child_obj)->arity; ++i ) { - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_children(node, &((*child_obj)->children[i]), check_only)) ) { - return ret; - } - } - - return ORTE_SUCCESS; -} - - -static char * get_native_slot_list(orte_node_t *cur_node, hwloc_obj_t *pu_obj, int *put_idx_ref) -{ - int i; - char *slot_list = NULL; - hwloc_obj_t *binding_parent = NULL; - hwloc_obj_t *cur_parent = NULL; - hwloc_cpuset_t binding_cpuset; - hwloc_cpuset_t scratch_cpuset; - char *type_str = NULL; - - /* - * Sanity check - */ - if( NULL == pu_obj ) { - return NULL; - } - - /* - * Determine the cpumask to send to the backend for binding - */ - - /* - * Iterate up the tree until we reach the binding parent - */ - binding_parent = rmaps_lama_find_parent(cur_node->topology, pu_obj, lama_binding_level); - if( NULL == binding_parent ) { - return NULL; - } - - /* - * Iterate across cousins until we find enough resources or hit the node boundary - */ - binding_cpuset = hwloc_bitmap_alloc(); - hwloc_bitmap_zero(binding_cpuset); - - scratch_cpuset = hwloc_bitmap_alloc(); - - cur_parent = binding_parent; - - for(i = 0; i < lama_binding_num_levels; ++i) { - /* - * Check MPPR Availability - */ - if( ORTE_SUCCESS != rmaps_lama_check_mppr(cur_node, cur_parent) ) { - goto cleanup; - } - - /* - * Accumulate the bitmask - * - * JJH: TODO: Add resource offline check (?) - */ - hwloc_bitmap_zero(scratch_cpuset); - /* JJH: Maybe use opal_hwloc_base_get_available_cpus(cur_node->topology, (*cur_parent)) ? - * They do pretty much the same thing, but with more checks... 
- */ - hwloc_bitmap_and(scratch_cpuset, (*cur_parent)->allowed_cpuset, (*cur_parent)->online_cpuset); - hwloc_bitmap_or(binding_cpuset, scratch_cpuset, binding_cpuset); - -#if 0 - { - hwloc_obj_snprintf(str, sizeof(str), cur_node->topology, *cur_parent, "#", 0); - printf("--> BINDING TO -- %-20s \t -- %2d of %2d -- %2d vs %2d\n",str, - i, lama_binding_level, - (*binding_parent)->logical_index, (*cur_parent)->logical_index); - - hwloc_bitmap_snprintf(str, sizeof(str), (*cur_parent)->allowed_cpuset ); - printf("--> CPU A : %-20s\n", str); - hwloc_bitmap_snprintf(str, sizeof(str), (*cur_parent)->online_cpuset ); - printf("--> CPU B : %-20s\n", str); - hwloc_bitmap_snprintf(str, sizeof(str), scratch_cpuset); - printf("--> CPU C : %-20s\n", str); - hwloc_bitmap_snprintf(str, sizeof(str), binding_cpuset); - printf("--> CPU D : %-20s\n", str); - } -#endif - - /* - * Iterate to the next cousin. - * If we exceed the boundary of the node, then send up an error. - */ - if( (i+1) < lama_binding_num_levels && NULL == (*cur_parent)->next_cousin ) { - type_str = lama_type_enum_to_str(lama_binding_level); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Error: Not able to bind to %*d x %10s - Stopped at %*d", - MAX_BIND_DIGIT_LEN, lama_binding_num_levels, - type_str, - MAX_BIND_DIGIT_LEN, i); - free(type_str); - type_str = NULL; - goto cleanup; - } - /* - * Point to the next cousin - */ - if( NULL != (*cur_parent)->next_cousin ) { - cur_parent = &((*cur_parent)->next_cousin); - } - } - - /* - * Account for the process placement in the MPPR - * Assumes a previous check - * We cannot do this in the loop, since if the MPPR check fails we would - * need to roll back previous increments. - */ - cur_parent = binding_parent; - for(i = 0; i < lama_binding_num_levels; ++i) { - /* - * Account for the process placement in the MPPR - * Assumes a previous check. 
- */ - if( ORTE_SUCCESS != rmaps_lama_inc_mppr(cur_node, cur_parent) ) { - goto cleanup; - } - - /* - * Point to the next cousin - */ - if( NULL != (*cur_parent)->next_cousin ) { - cur_parent = &((*cur_parent)->next_cousin); - } - } - - /* - * Convert the cpuset to a slot_list for the remote daemon - */ - hwloc_bitmap_list_asprintf(&slot_list, binding_cpuset); - - cleanup: - hwloc_bitmap_free(scratch_cpuset); - hwloc_bitmap_free(binding_cpuset); - free(binding_parent); - - return slot_list; -} - - -/********************************* - * Timer Support - *********************************/ -static double rmaps_lama_get_time(void) -{ - double wtime; - -#if OPAL_TIMER_USEC_NATIVE - wtime = (double)opal_timer_base_get_usec() / 1000000.0; -#else - struct timeval tv; - gettimeofday(&tv, NULL); - wtime = tv.tv_sec; - wtime += (double)tv.tv_usec / 1000000.0; -#endif - - return wtime; -} - -static void rmaps_lama_set_time(int idx, bool is_start) -{ - if(idx < RMAPS_LAMA_TIMER_MAX ) { - if( is_start ) { - timer_start[idx] = rmaps_lama_get_time(); - } else { - timer_end[idx] = rmaps_lama_get_time(); - timer_accum[idx] += timer_end[idx] - timer_start[idx]; - } - } -} - -static void rmaps_lama_display_all_timers(void) -{ - double diff = 0.0; - double total = 0.0; - char * label = NULL; - - opal_output(0, - "mca:rmaps:lama: Timing: ---------------------------\n"); - - /* - * Timer: Parse Parameters - */ - label = strdup("Parse Params"); - diff = timer_accum[RMAPS_LAMA_TIMER_PARSE_PARAMS]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - total += diff; - - /* - * Timer: Build Max Tree - */ - label = strdup("Build Max Tree"); - diff = timer_accum[RMAPS_LAMA_TIMER_BUILD_MAX_TREE]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - total += diff; - - /* - * Timer: Mapping - */ - label = strdup("Mapping"); - diff = timer_accum[RMAPS_LAMA_TIMER_MAPPING]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - total += diff; - - /* - * 
Timer: Ordering - */ - label = strdup("Ordering"); - diff = timer_accum[RMAPS_LAMA_TIMER_ORDERING]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - total += diff; - - /* - * Timer: Total Overhead - */ - label = strdup("Other Overhead"); - diff = timer_accum[RMAPS_LAMA_TIMER_TOTAL]; - rmaps_lama_display_indv_timer_core(diff - total, label); - free(label); - - /* - * Timer: Total - */ - label = strdup("Total"); - diff = timer_accum[RMAPS_LAMA_TIMER_TOTAL]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - - opal_output(0, - "mca:rmaps:lama: ---------------------------------"); -} - -static void rmaps_lama_clear_timers(void) -{ - int i; - for(i = 0; i < RMAPS_LAMA_TIMER_MAX; ++i) { - timer_start[i] = 0.0; - timer_end[i] = 0.0; - timer_accum[i] = 0.0; - } -} - - -static void rmaps_lama_display_indv_timer_core(double diff, char *str) -{ - double perc = 0; - double total = 0; - - total = timer_end[RMAPS_LAMA_TIMER_TOTAL] - timer_start[RMAPS_LAMA_TIMER_TOTAL]; - perc = (diff/total) * 100; - - opal_output(0, - "mca:rmaps:lama: \t%-20s = %10.2f ms\t%6.2f %s\n", - str, (diff * 1000), perc, "%"); - return; -} diff --git a/orte/mca/rmaps/lama/rmaps_lama_params.c b/orte/mca/rmaps/lama/rmaps_lama_params.c deleted file mode 100644 index 6a54b4ba340..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama_params.c +++ /dev/null @@ -1,878 +0,0 @@ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * Processing for command line interface options - * - */ -#include "rmaps_lama.h" - -#include "opal/util/argv.h" - -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" -#include "orte/util/show_help.h" - -#include - -/********************************* - * Local Functions - *********************************/ -/* - * QSort: Integer comparison - */ -static int lama_parse_int_sort(const void *a, const void *b); - -/* - * Convert the '-ppr' syntax from the 'ppr' component to the 'lama' '-mppr' syntax. - */ -static char * rmaps_lama_covert_ppr(char * given_ppr); - -/********************************* - * Parsing Functions - *********************************/ -int rmaps_lama_process_alias_params(orte_job_t *jdata) -{ - int exit_status = ORTE_SUCCESS; - - /* - * Mapping options - * Note: L1, L2, L3 are not exposed in orterun to the user, so - * there is no need to specify them here. 
- */ - if( NULL == rmaps_lama_cmd_map ) { - /* orte_rmaps_base.mapping */ - switch( ORTE_GET_MAPPING_POLICY(jdata->map->mapping) ) { - case ORTE_MAPPING_BYNODE: - /* rmaps_lama_cmd_map = strdup("nbNsL3L2L1ch"); */ - rmaps_lama_cmd_map = strdup("nbsch"); - break; - case ORTE_MAPPING_BYBOARD: - /* rmaps_lama_cmd_map = strdup("bnNsL3L2L1ch"); */ - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - "by board", "mapping by board not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - break; - case ORTE_MAPPING_BYNUMA: - /* rmaps_lama_cmd_map = strdup("NbnsL3L2L1ch"); */ - rmaps_lama_cmd_map = strdup("Nbnsch"); - break; - case ORTE_MAPPING_BYSOCKET: - /* rmaps_lama_cmd_map = strdup("sNbnL3L2L1ch"); */ - rmaps_lama_cmd_map = strdup("sbnch"); - break; - case ORTE_MAPPING_BYL3CACHE: - rmaps_lama_cmd_map = strdup("L3sNbnL2L1ch"); - break; - case ORTE_MAPPING_BYL2CACHE: - rmaps_lama_cmd_map = strdup("L2sNbnL1ch"); - break; - case ORTE_MAPPING_BYL1CACHE: - rmaps_lama_cmd_map = strdup("L1sNbnch"); - break; - case ORTE_MAPPING_BYCORE: - case ORTE_MAPPING_BYSLOT: - /* rmaps_lama_cmd_map = strdup("cL1L2L3sNbnh"); */ - rmaps_lama_cmd_map = strdup("csbnh"); - break; - case ORTE_MAPPING_BYHWTHREAD: - /* rmaps_lama_cmd_map = strdup("hcL1L2L3sNbn"); */ - rmaps_lama_cmd_map = strdup("hcsbn"); - break; - case ORTE_MAPPING_RR: - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - "round robin", "mapping by round robin not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - case ORTE_MAPPING_SEQ: - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - "sequential", "mapping by sequential not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - case ORTE_MAPPING_BYUSER: - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - "by user", "mapping by user not supported by LAMA"); - exit_status = 
ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - default: - /* - * Default is map-by core - */ - rmaps_lama_cmd_map = strdup("cL1L2L3sNbnh"); - break; - } - } - - /* - * Binding Options - */ - if( NULL == rmaps_lama_cmd_bind ) { - /* - * No binding specified, use default - */ - if( !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) || - !OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) || - OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding) ) { - rmaps_lama_cmd_bind = NULL; - } - - switch( OPAL_GET_BINDING_POLICY(jdata->map->binding) ) { - case OPAL_BIND_TO_BOARD: - /* rmaps_lama_cmd_bind = strdup("1b"); */ - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - "by board", "binding to board not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - break; - case OPAL_BIND_TO_NUMA: - rmaps_lama_cmd_bind = strdup("1N"); - break; - case OPAL_BIND_TO_SOCKET: - rmaps_lama_cmd_bind = strdup("1s"); - break; - case OPAL_BIND_TO_L3CACHE: - rmaps_lama_cmd_bind = strdup("1L3"); - break; - case OPAL_BIND_TO_L2CACHE: - rmaps_lama_cmd_bind = strdup("1L2"); - break; - case OPAL_BIND_TO_L1CACHE: - rmaps_lama_cmd_bind = strdup("1L1"); - break; - case OPAL_BIND_TO_CORE: - rmaps_lama_cmd_bind = strdup("1c"); - break; - case OPAL_BIND_TO_HWTHREAD: - rmaps_lama_cmd_bind = strdup("1h"); - break; - case OPAL_BIND_TO_CPUSET: - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - "by CPU set", "binding to CPU set not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - break; - default: - rmaps_lama_cmd_bind = NULL; - break; - } - } - - /* - * Ordering (a.k.a. 
Ranking) Options - */ - if( NULL == rmaps_lama_cmd_ordering ) { - /* orte_rmaps_base.ranking */ - switch( ORTE_GET_RANKING_POLICY(jdata->map->ranking) ) { - case ORTE_RANK_BY_SLOT: - rmaps_lama_cmd_ordering = strdup("s"); - break; - case ORTE_RANK_BY_NODE: - case ORTE_RANK_BY_NUMA: - case ORTE_RANK_BY_SOCKET: - case ORTE_RANK_BY_L3CACHE: - case ORTE_RANK_BY_L2CACHE: - case ORTE_RANK_BY_L1CACHE: - case ORTE_RANK_BY_CORE: - case ORTE_RANK_BY_HWTHREAD: - rmaps_lama_cmd_ordering = strdup("n"); - break; - case ORTE_RANK_BY_BOARD: - /* rmaps_lama_cmd_ordering = strdup("n"); */ - orte_show_help("help-orte-rmaps-lama.txt", - "invalid ordering option", - true, - "by board", "ordering by board not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - break; - default: - rmaps_lama_cmd_ordering = strdup("n"); - break; - } - } - - /* - * MPPR - */ - if( NULL == rmaps_lama_cmd_mppr ) { - /* - * The ppr is given in the map - */ - if( NULL != jdata->map->ppr) { - rmaps_lama_cmd_mppr = rmaps_lama_covert_ppr(jdata->map->ppr); - } - } - - /* - * Oversubscription - */ - if( ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping) ) { - rmaps_lama_can_oversubscribe = false; - } - else { - rmaps_lama_can_oversubscribe = true; - } - - /* - * Display revised values - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Revised Parameters -----"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Map : %s", - rmaps_lama_cmd_map); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Bind : %s", - rmaps_lama_cmd_bind); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: MPPR : %s", - rmaps_lama_cmd_mppr); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Order : %s", - rmaps_lama_cmd_ordering); - - cleanup: - return exit_status; -} - -static char * 
rmaps_lama_covert_ppr(char * given_ppr) -{ - return strdup(given_ppr); -} - -int rmaps_lama_parse_mapping(char *layout, - rmaps_lama_level_type_t **layout_types, - rmaps_lama_level_type_t **layout_types_sorted, - int *num_types) -{ - int exit_status = ORTE_SUCCESS; - char param[3]; - int i, j, len; - bool found_req_param_n = false; - bool found_req_param_h = false; - bool found_req_param_bind = false; - - /* - * Sanity Check: - * There is no default layout, so if we get here and nothing is specified - * then this is an error. - */ - if( NULL == layout ) { - orte_show_help("help-orte-rmaps-lama.txt", - "internal error", - true, - "rmaps_lama_parse_mapping", - "internal error 1"); - return ORTE_ERROR; - } - - *num_types = 0; - - /* - * Extract and convert all the keys - */ - len = strlen(layout); - for(i = 0; i < len; ++i) { - /* - * L1 : L1 Cache - * L2 : L2 Cache - * L3 : L3 Cache - */ - if( layout[i] == 'L' ) { - param[0] = layout[i]; - ++i; - /* - * Check for 2 characters - */ - if( i >= len ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, "cache level missing number"); - exit_status = ORTE_ERROR; - goto cleanup; - } - param[1] = layout[i]; - param[2] = '\0'; - } - /* - * n : Machine - * b : Board - * s : Socket - * c : Core - * h : Hardware Thread - * N : NUMA Node - */ - else { - param[0] = layout[i]; - param[1] = '\0'; - } - - /* - * Append level - */ - *num_types += 1; - *layout_types = (rmaps_lama_level_type_t*)realloc(*layout_types, sizeof(rmaps_lama_level_type_t) * (*num_types)); - (*layout_types)[(*num_types)-1] = lama_type_str_to_enum(param); - } - - /* - * Check for duplicates and unknowns - * Copy to sorted list - */ - *layout_types_sorted = (rmaps_lama_level_type_t*)malloc(sizeof(rmaps_lama_level_type_t) * (*num_types)); - for( i = 0; i < *num_types; ++i ) { - /* - * Copy for later sorting - */ - (*layout_types_sorted)[i] = (*layout_types)[i]; - - /* - * Look for unknown and unsupported options - */ - if( 
LAMA_LEVEL_UNKNOWN <= (*layout_types)[i] ) { - char *msg; - asprintf(&msg, "unknown mapping level at position %d", i + 1); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - - if( LAMA_LEVEL_MACHINE == (*layout_types)[i] ) { - found_req_param_n = true; - } - - if( LAMA_LEVEL_PU == (*layout_types)[i] ) { - found_req_param_h = true; - } - - if( lama_binding_level == (*layout_types)[i] ) { - found_req_param_bind = true; - } - - /* - * Look for duplicates - */ - for( j = i+1; j < *num_types; ++j ) { - if( (*layout_types)[i] == (*layout_types)[j] ) { - char *msg; - asprintf(&msg, "duplicate mapping levels at position %d and %d", - i + 1, j + 1); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - } - } - - /* - * The user is required to specify at least the: - * - machine - * - hardware thread (needed for lower bound binding) JJH: We should be able to lift this... - * - binding layer (need it to stride the mapping) - * Only print the error message once, for brevity. 
- */ - if( !found_req_param_n ) { - char *msg; - asprintf(&msg, "missing required 'n' mapping token"); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - else if(!found_req_param_h) { - char *msg; - asprintf(&msg, "missing required 'h' mapping token"); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } else if (!found_req_param_bind) { - char *msg; - asprintf(&msg, "missing required mapping token for the current binding level"); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * Sort the items - */ - qsort((*layout_types_sorted ), (*num_types), sizeof(int), lama_parse_int_sort); - - cleanup: - return exit_status; -} - -int rmaps_lama_parse_binding(char *layout, rmaps_lama_level_type_t *binding_level, int *num_types) -{ - int exit_status = ORTE_SUCCESS; - char param[3]; - char num[MAX_BIND_DIGIT_LEN]; - int i, n, p, len; - - /* - * Default: If nothing specified - * - Bind to machine - */ - if( NULL == layout ) { - *binding_level = LAMA_LEVEL_MACHINE; - *num_types = 1; - return ORTE_SUCCESS; - } - - *num_types = 0; - - /* - * Extract and convert all the keys - */ - len = strlen(layout); - n = 0; - p = 0; - for(i = 0; i < len; ++i) { - /* - * Must start with a digit - */ - if( isdigit(layout[i]) ) { - /* - * Check: Digits must come first - */ - if( p != 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "missing digit(s) before binding level token"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - num[n] = layout[i]; - ++n; - /* - * Check: Exceed bound of number of digits - */ - if( n >= MAX_BIND_DIGIT_LEN ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - 
layout, "too many digits"); - exit_status = ORTE_ERROR; - goto cleanup; - } - } - /* - * Extract the level - */ - else { - /* - * Check: Digits must come first - */ - if( n == 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "missing digit(s) before binding level token"); - exit_status = ORTE_ERROR; - goto cleanup; - } - /* - * Check: Only one level allowed - */ - if( p != 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "only one binding level may be specified"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * L1 : L1 Cache - * L2 : L2 Cache - * L3 : L3 Cache - */ - if( layout[i] == 'L' ) { - param[0] = layout[i]; - ++i; - /* - * Check for 2 characters - */ - if( i >= len ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "only one binding level may be specified"); - exit_status = ORTE_ERROR; - goto cleanup; - } - param[1] = layout[i]; - p = 2; - } - /* - * n : Machine - * b : Board - * s : Socket - * c : Core - * h : Hardware Thread - * N : NUMA Node - */ - else { - param[0] = layout[i]; - p = 1; - } - param[p] = '\0'; - } - } - /* - * Check that the level was specified - */ - if( p == 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "binding specification is empty"); - exit_status = ORTE_ERROR; - goto cleanup; - } - num[n] = '\0'; - - *binding_level = lama_type_str_to_enum(param); - *num_types = atoi(num); - - /* - * Check for unknown level - */ - if( LAMA_LEVEL_UNKNOWN <= *binding_level ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "unknown binding level"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - cleanup: - return exit_status; -} - -int rmaps_lama_parse_mppr(char *layout, rmaps_lama_level_info_t **mppr_levels, int *num_types) -{ - int exit_status = ORTE_SUCCESS; - char param[3]; - char 
num[MAX_BIND_DIGIT_LEN]; - char **argv = NULL; - int argc = 0; - int i, j, len; - int p, n; - - /* - * Default: Unrestricted allocation - * 'oversubscribe' flag accounted for elsewhere - */ - if( NULL == layout ) { - *mppr_levels = NULL; - *num_types = 0; - return ORTE_SUCCESS; - } - - *num_types = 0; - - /* - * Split by ',' - * <#:level>,<#:level>,... - */ - argv = opal_argv_split(layout, ','); - argc = opal_argv_count(argv); - for(j = 0; j < argc; ++j) { - /* - * Parse <#:level> - */ - len = strlen(argv[j]); - n = 0; - p = 0; - for(i = 0; i < len; ++i) { - /* - * Skip the ':' separator and whitespace - */ - if( argv[j][i] == ':' || isblank(argv[j][i])) { - continue; - } - /* - * Must start with a digit - */ - else if( isdigit(argv[j][i]) ) { - /* - * Check: Digits must come first - */ - if( p != 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "missing digit(s) before resource specification"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - num[n] = argv[j][i]; - ++n; - /* - * Check: Exceed bound of number of digits - */ - if( n >= MAX_BIND_DIGIT_LEN ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "too many digits"); - exit_status = ORTE_ERROR; - goto cleanup; - } - } - /* - * Extract the level - */ - else { - /* - * Check: Digits must come first - */ - if( n == 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "missing digit(s) before resource specification"); - exit_status = ORTE_ERROR; - goto cleanup; - } - /* - * Check: Only one level allowed - */ - if( p != 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "only one resource type may be listed per specification"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * L1 : L1 Cache - * L2 : L2 Cache - * L3 : L3 Cache - */ - if( argv[j][i] == 'L' ) { - param[0] = argv[j][i]; - ++i; - /* - * Check for 2 characters - */ - if( i 
>= len ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "cache level missing number"); - exit_status = ORTE_ERROR; - goto cleanup; - } - param[1] = argv[j][i]; - p = 2; - } - /* - * n : Machine - * b : Board - * s : Socket - * c : Core - * h : Hardware Thread - * N : NUMA Node - */ - else { - param[0] = argv[j][i]; - p = 1; - } - param[p] = '\0'; - } - } - - /* - * Whitespace, just skip - */ - if( n == 0 && p == 0 ) { - continue; - } - - /* - * Check that the level was specified - */ - if( p == 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "resource type not specified"); - exit_status = ORTE_ERROR; - goto cleanup; - } - num[n] = '\0'; - - /* - * Append level - */ - *num_types += 1; - *mppr_levels = (rmaps_lama_level_info_t*)realloc(*mppr_levels, sizeof(rmaps_lama_level_info_t) * (*num_types)); - (*mppr_levels)[(*num_types)-1].type = lama_type_str_to_enum(param); - (*mppr_levels)[(*num_types)-1].max_resources = atoi(num); - - } - - /* - * Check for duplicates and unknowns - */ - for( i = 0; i < *num_types; ++i ) { - /* - * Look for unknown and unsupported options - */ - if( LAMA_LEVEL_UNKNOWN <= (*mppr_levels)[i].type ) { - char *msg; - asprintf(&msg, "unknown resource type at position %d", i + 1); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * Look for duplicates - */ - for( j = i+1; j < *num_types; ++j ) { - if( (*mppr_levels)[i].type == (*mppr_levels)[j].type ) { - char *msg; - asprintf(&msg, "duplicate resource tpyes at position %d and %d", - i + 1, j + 1); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - } - } - - cleanup: - if( NULL != argv ) { - opal_argv_free(argv); - argv = NULL; - } - - return exit_status; -} - -int 
rmaps_lama_parse_ordering(char *layout, - rmaps_lama_order_type_t *order) -{ - /* - * Default: Natural ordering - */ - if( NULL == layout ) { - *order = LAMA_ORDER_NATURAL; - return ORTE_SUCCESS; - } - - /* - * Sequential Ordering - */ - if( 0 == strncmp(layout, "s", strlen("s")) || - 0 == strncmp(layout, "S", strlen("S")) ) { - *order = LAMA_ORDER_SEQ; - } - /* - * Natural Ordering - */ - else if( 0 == strncmp(layout, "n", strlen("n")) || - 0 == strncmp(layout, "N", strlen("N")) ) { - *order = LAMA_ORDER_NATURAL; - } - /* - * Check for unknown options - */ - else { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid ordering option", - true, - "unsupported ordering option", layout); - return ORTE_ERROR; - } - - return ORTE_SUCCESS; -} - -bool rmaps_lama_ok_to_prune_level(rmaps_lama_level_type_t level) -{ - int i; - - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - if( level == lama_mapping_layout[i] ) { - return false; - } - } - - return true; -} - -/********************************* - * Support Functions - *********************************/ -static int lama_parse_int_sort(const void *a, const void *b) { - int left = *((int*)a); - int right = *((int*)b); - - if( left < right ) { - return -1; - } - else if( left > right ) { - return 1; - } - else { - return 0; - } -} diff --git a/orte/mca/rmaps/mindist/rmaps_mindist_module.c b/orte/mca/rmaps/mindist/rmaps_mindist_module.c index 53ce91f71ae..29d5e7813b5 100644 --- a/orte/mca/rmaps/mindist/rmaps_mindist_module.c +++ b/orte/mca/rmaps/mindist/rmaps_mindist_module.c @@ -45,7 +45,7 @@ static int mindist_map(orte_job_t *jdata); orte_rmaps_base_module_t orte_rmaps_mindist_module = { - mindist_map + .map_job = mindist_map }; /* @@ -391,15 +391,6 @@ static int mindist_map(orte_job_t *jdata) } } - /* compute vpids and add proc objects to the job - do this after - * each app_context so that the ranks within each context are - * contiguous - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, 
&node_list))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* track the total number of processes we mapped - must update * this value AFTER we compute vpids so that computation * is done correctly @@ -415,6 +406,17 @@ static int mindist_map(orte_job_t *jdata) OBJ_DESTRUCT(&node_list); } free(orte_rmaps_base.device); + /* compute vpids and add proc objects to the job - do this after + * each app_context so that the ranks within each context are + * contiguous + */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* mark the job as fully described */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); return ORTE_SUCCESS; error: @@ -425,3 +427,96 @@ static int mindist_map(orte_job_t *jdata) return rc; } + +#if 0 +static int assign_locations(orte_job_t *jdata) +{ + int j, k, m, n, npus; + orte_app_context_t *app; + orte_node_t *node; + orte_proc_t *proc; + hwloc_obj_t obj=NULL; + mca_base_component_t *c = &mca_rmaps_mindist_component.base_version; + int rc; + opal_list_t numa_list; + opal_rmaps_numa_node_t *numa; + + if (NULL == jdata->map->last_mapper|| + 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { + /* the mapper should have been set to me */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:mindist: job %s not using mindist mapper", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:mindist: assign locations for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* start assigning procs to objects, filling each object as we go until + * all procs are assigned. 
If one pass doesn't catch all the required procs, + * then loop thru the list again to handle the oversubscription + */ + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + if (NULL == node->topology || NULL == node->topology->topo) { + orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", + true, node->name); + return ORTE_ERR_SILENT; + } + + /* first we need to fill summary object for root with information about nodes + * so we call opal_hwloc_base_get_nbobjs_by_type */ + opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE); + OBJ_CONSTRUCT(&numa_list, opal_list_t); + rc = opal_hwloc_get_sorted_numa_list(node->topology->topo, orte_rmaps_base.device, &numa_list); + if (rc > 1) { + orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:several-devices", + true, orte_rmaps_base.device, rc, node->name); + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + OPAL_LIST_DESTRUCT(&numa_list); + return ORTE_ERR_TAKE_NEXT_OPTION; + } else if (rc < 0) { + orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:device-not-found", + true, orte_rmaps_base.device, node->name); + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + OPAL_LIST_DESTRUCT(&numa_list); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + j = 0; + OPAL_LIST_FOREACH(numa, &numa_list, opal_rmaps_numa_node_t) { + /* get the hwloc object for this numa */ + if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, numa->index, OPAL_HWLOC_AVAILABLE))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + OPAL_LIST_DESTRUCT(&numa_list); + return ORTE_ERR_NOT_FOUND; + } + npus = opal_hwloc_base_get_npus(node->topology->topo, obj); + /* fill the numa region 
with procs from this job until we either + * have assigned everyone or the region is full */ + for (k = j; k < node->procs->size && 0 < npus; k++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + continue; + } + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + ++j; + --npus; + } + } + OPAL_LIST_DESTRUCT(&numa_list); + } + } + + return ORTE_SUCCESS; +} +#endif diff --git a/orte/mca/rmaps/ppr/rmaps_ppr.c b/orte/mca/rmaps/ppr/rmaps_ppr.c index 35285e95cda..41523de3b6b 100644 --- a/orte/mca/rmaps/ppr/rmaps_ppr.c +++ b/orte/mca/rmaps/ppr/rmaps_ppr.c @@ -33,9 +33,11 @@ #include "rmaps_ppr.h" static int ppr_mapper(orte_job_t *jdata); +static int assign_locations(orte_job_t *jdata); orte_rmaps_base_module_t orte_rmaps_ppr_module = { - ppr_mapper + .map_job = ppr_mapper, + .assign_locations = assign_locations }; /* RHC: will eventually remove this @@ -391,11 +393,6 @@ static int ppr_mapper(orte_job_t *jdata) rc = ORTE_ERR_SILENT; goto error; } - /* compute vpids and add proc objects to the job */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { - ORTE_ERROR_LOG(rc); - goto error; - } /* track the total number of processes we mapped - must update * this AFTER we compute vpids so that computation is done @@ -623,3 +620,122 @@ static void prune(orte_jobid_t jobid, error: opal_output(0, "INFINITE LOOP"); } + +static int assign_locations(orte_job_t *jdata) +{ + int i, j, m, n; + mca_base_component_t *c=&mca_rmaps_ppr_component.base_version; + orte_node_t *node; + orte_proc_t *proc; + orte_app_context_t *app; + opal_hwloc_level_t level; + hwloc_obj_t obj; + unsigned int cache_level=0; + int ppr, cnt, nobjs, nprocs_mapped; + char **ppr_req, **ck; + + if (NULL == jdata->map->last_mapper || + 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { + /* a mapper has been specified, and it isn't 
me */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:ppr: job %s not using ppr assign: %s", + ORTE_JOBID_PRINT(jdata->jobid), + (NULL == jdata->map->last_mapper) ? "NULL" : jdata->map->last_mapper); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:ppr: assigning locations for job %s with ppr %s policy %s", + ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr, + orte_rmaps_base_print_mapping(jdata->map->mapping)); + + /* pickup the object level */ + if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_NODE_LEVEL; + } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_HWTHREAD_LEVEL; + } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_CORE_LEVEL; + } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_SOCKET_LEVEL; + } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_L1CACHE_LEVEL; + cache_level = 1; + } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_L2CACHE_LEVEL; + cache_level = 2; + } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_L3CACHE_LEVEL; + cache_level = 3; + } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_NUMA_LEVEL; + } else { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + /* get the ppr value */ + ppr_req = opal_argv_split(jdata->map->ppr, ','); + ck = opal_argv_split(ppr_req[0], ':'); + ppr = strtol(ck[0], NULL, 10); + opal_argv_free(ck); + opal_argv_free(ppr_req); + + /* start assigning procs to objects, filling each object as we go until + * all procs are assigned. 
*/ + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + nprocs_mapped = 0; + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + if (NULL == node->topology || NULL == node->topology->topo) { + orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", + true, node->name); + return ORTE_ERR_SILENT; + } + if (OPAL_HWLOC_NODE_LEVEL == level) { + obj = hwloc_get_root_obj(node->topology->topo); + for (j=0; j < node->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + continue; + } + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + } + } else { + /* get the number of resources on this node at this level */ + nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, + level, cache_level, + OPAL_HWLOC_AVAILABLE); + + /* map the specified number of procs to each such resource on this node, + * recording the locale of each proc so we know its cpuset + */ + cnt = 0; + for (i=0; i < nobjs; i++) { + obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, + level, cache_level, + i, OPAL_HWLOC_AVAILABLE); + for (j=0; j < node->procs->size && cnt < ppr && nprocs_mapped < app->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + continue; + } + nprocs_mapped++; + cnt++; + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + } + } + } + } + } + return ORTE_SUCCESS; +} diff --git a/orte/mca/rmaps/rank_file/rmaps_rank_file.c b/orte/mca/rmaps/rank_file/rmaps_rank_file.c index 26d19f6881e..ee8651d5b2b 100644 --- a/orte/mca/rmaps/rank_file/rmaps_rank_file.c +++ 
b/orte/mca/rmaps/rank_file/rmaps_rank_file.c @@ -51,6 +51,13 @@ #include "orte/mca/rmaps/rank_file/rmaps_rank_file_lex.h" #include "orte/runtime/orte_globals.h" +static int orte_rmaps_rf_map(orte_job_t *jdata); + +orte_rmaps_base_module_t orte_rmaps_rank_file_module = { + .map_job = orte_rmaps_rf_map +}; + + static int orte_rmaps_rank_file_parse(const char *); static char *orte_rmaps_rank_file_parse_string_or_int(void); static const char *orte_rmaps_rank_file_name_cur = NULL; @@ -363,6 +370,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) } } OBJ_DESTRUCT(&rankmap); + /* mark the job as fully described */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + return rc; error: @@ -371,11 +381,6 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) return rc; } -orte_rmaps_base_module_t orte_rmaps_rank_file_module = { -orte_rmaps_rf_map -}; - - static int orte_rmaps_rank_file_parse(const char *rankfile) { int token; diff --git a/orte/mca/rmaps/resilient/rmaps_resilient.c b/orte/mca/rmaps/resilient/rmaps_resilient.c index afc4576737b..3ead4d31305 100644 --- a/orte/mca/rmaps/resilient/rmaps_resilient.c +++ b/orte/mca/rmaps/resilient/rmaps_resilient.c @@ -5,7 +5,7 @@ * Corporation. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. 
* * $COPYRIGHT$ * @@ -36,6 +36,14 @@ #include "orte/mca/rmaps/base/base.h" #include "rmaps_resilient.h" +static int orte_rmaps_resilient_map(orte_job_t *jdata); +static int resilient_assign(orte_job_t *jdata); + +orte_rmaps_base_module_t orte_rmaps_resilient_module = { + .map_job = orte_rmaps_resilient_map, + .assign_locations = resilient_assign +}; + /* * Local variable @@ -270,9 +278,22 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) return rc; } -orte_rmaps_base_module_t orte_rmaps_resilient_module = { - orte_rmaps_resilient_map -}; +static int resilient_assign(orte_job_t *jdata) +{ + mca_base_component_t *c = &mca_rmaps_resilient_component.super.base_version; + + if (NULL == jdata->map->last_mapper || + 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { + /* a mapper has been specified, and it isn't me */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:resilient: job %s not using resilient assign: %s", + ORTE_JOBID_PRINT(jdata->jobid), + (NULL == jdata->map->last_mapper) ? 
"NULL" : jdata->map->last_mapper); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + return ORTE_ERR_NOT_IMPLEMENTED; +} static char *orte_getline(FILE *fp) { @@ -855,15 +876,6 @@ static int map_to_ftgrps(orte_job_t *jdata) /* track number of procs */ jdata->num_procs += app->num_procs; - /* compute vpids and add proc objects to the job - this has to be - * done after each app_context is mapped in order to keep the - * vpids contiguous within an app_context - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* cleanup the node list - it can differ from one app_context * to another, so we have to get it every time */ @@ -873,11 +885,5 @@ static int map_to_ftgrps(orte_job_t *jdata) OBJ_DESTRUCT(&node_list); } - /* compute and save local ranks */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { - ORTE_ERROR_LOG(rc); - return rc; - } - return ORTE_SUCCESS; } diff --git a/orte/mca/rmaps/rmaps.h b/orte/mca/rmaps/rmaps.h index 9670c7ac2c8..4faaf2e2cb8 100644 --- a/orte/mca/rmaps/rmaps.h +++ b/orte/mca/rmaps/rmaps.h @@ -60,23 +60,30 @@ BEGIN_C_DECLS * rmaps module functions */ -/* mapping event - the event one activates to schedule mapping - * of procs to nodes for pending jobs - */ -ORTE_DECLSPEC extern opal_event_t orte_mapping_event; - /** * RMAPS module functions - these are not accessible to the outside world, * but are defined here by convention */ + +/* map a job - used by the HNP to compute the #procs on each node. + * This is passed to the backend daemons as a regex which they + * use to create an orte_job_map_t for the job */ typedef int (*orte_rmaps_base_module_map_fn_t)(orte_job_t *jdata); +/* assign a location to each process. 
Used by the backend daemons, + * this function takes the orte_job_map_t created from the regex + * and assigns each process to a specific location within the + * hardware topology based on the --map-by directive */ +typedef int (*orte_rmaps_base_module_assign_loc_fn_t)(orte_job_t *jdata); + /* * rmaps module version 3.0.0 */ struct orte_rmaps_base_module_3_0_0_t { /** Mapping function pointer */ orte_rmaps_base_module_map_fn_t map_job; + /* assign locations */ + orte_rmaps_base_module_assign_loc_fn_t assign_locations; }; /** Convenience typedef */ typedef struct orte_rmaps_base_module_3_0_0_t orte_rmaps_base_module_3_0_0_t; diff --git a/orte/mca/rmaps/round_robin/Makefile.am b/orte/mca/rmaps/round_robin/Makefile.am index 1f19dcc7657..bd51a226429 100644 --- a/orte/mca/rmaps/round_robin/Makefile.am +++ b/orte/mca/rmaps/round_robin/Makefile.am @@ -10,6 +10,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -23,7 +24,8 @@ sources = \ rmaps_rr.c \ rmaps_rr.h \ rmaps_rr_component.c \ - rmaps_rr_mappers.c + rmaps_rr_mappers.c \ + rmaps_rr_assign.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index a764e0243f3..b268c4953e7 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -243,15 +243,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) goto error; } - /* compute vpids and add proc objects to the job - do this after - * each app_context so that the ranks within each context are - * contiguous - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* track the total number of processes we mapped - must update * this value AFTER we compute vpids so that computation * is done correctly @@ -278,6 +269,113 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) return rc; } +static int orte_rmaps_rr_assign_locations(orte_job_t *jdata) +{ + mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version; + int rc; + + if (NULL == jdata->map->last_mapper || + 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { + /* a mapper has been specified, and it isn't me */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: job %s not using rr mapper", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assign locations for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* if the mapping directive was byslot or bynode, then we + * assign locations to the root object level */ + if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping) || + ORTE_MAPPING_BYSLOT == 
ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + return orte_rmaps_rr_assign_root_level(jdata); + } + + /* otherwise, assign by object */ + if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_PU, 0); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't assign by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CORE, 0); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CACHE, 1); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CACHE, 2); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = 
orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CACHE, 3); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_SOCKET, 0); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_NODE, 0); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else { + /* unrecognized mapping directive */ + orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", + true, "mapping", + orte_rmaps_base_print_mapping(jdata->map->mapping)); + rc = ORTE_ERR_SILENT; + } + return rc; +} + orte_rmaps_base_module_t orte_rmaps_round_robin_module = { - orte_rmaps_rr_map + .map_job = orte_rmaps_rr_map, + .assign_locations = orte_rmaps_rr_assign_locations }; diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.h b/orte/mca/rmaps/round_robin/rmaps_rr.h index 6591a3b6c20..4d998bbbba1 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.h +++ b/orte/mca/rmaps/round_robin/rmaps_rr.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. 
* All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Cisco Systems, Inc. All rights reserved * $COPYRIGHT$ * @@ -54,6 +54,13 @@ ORTE_MODULE_DECLSPEC int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_app_context orte_vpid_t num_procs, hwloc_obj_type_t target, unsigned cache_level); +ORTE_MODULE_DECLSPEC int orte_rmaps_rr_assign_root_level(orte_job_t *jdata); + +ORTE_MODULE_DECLSPEC int orte_rmaps_rr_assign_byobj(orte_job_t *jdata, + hwloc_obj_type_t target, + unsigned cache_level); + + END_C_DECLS #endif diff --git a/orte/mca/rmaps/round_robin/rmaps_rr_assign.c b/orte/mca/rmaps/round_robin/rmaps_rr_assign.c new file mode 100644 index 00000000000..81fa0b67b08 --- /dev/null +++ b/orte/mca/rmaps/round_robin/rmaps_rr_assign.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include + +#include "opal/util/output.h" +#include "opal/mca/hwloc/base/base.h" + +#include "orte/util/show_help.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "orte/mca/rmaps/base/rmaps_private.h" +#include "orte/mca/rmaps/base/base.h" +#include "rmaps_rr.h" + +int orte_rmaps_rr_assign_root_level(orte_job_t *jdata) +{ + int i, m; + orte_node_t *node; + orte_proc_t *proc; + hwloc_obj_t obj=NULL; + + opal_output_verbose(2, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assigning procs to root level for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + opal_output_verbose(2, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr:slot working node %s", + node->name); + /* get the root object as we are not assigning + * locale here except at the node level */ + if (NULL == node->topology || NULL == node->topology->topo) { + /* nothing we can do */ + continue; + } + obj = hwloc_get_root_obj(node->topology->topo); + for (i=0; i < node->procs->size; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr:assign skipping proc %s - from another job", + ORTE_NAME_PRINT(&proc->name)); + continue; + } + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + } + } + return ORTE_SUCCESS; +} + +/* mapping by hwloc object looks a lot like mapping by node, + * but has the added complication of possibly having different + * numbers of objects on 
each node + */ +int orte_rmaps_rr_assign_byobj(orte_job_t *jdata, + hwloc_obj_type_t target, + unsigned cache_level) +{ + int start, j, m, n; + orte_app_context_t *app; + orte_node_t *node; + orte_proc_t *proc; + hwloc_obj_t obj=NULL; + unsigned int nobjs; + + opal_output_verbose(2, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assigning locations by %s for job %s", + hwloc_obj_type_string(target), + ORTE_JOBID_PRINT(jdata->jobid)); + + + /* assign every proc of this job to an object of the target type on its + * node. The object index is the proc's slot in node->procs plus the start + * offset, modulo nobjs, so procs wrap around when oversubscribed + */ + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + if (NULL == node->topology || NULL == node->topology->topo) { + orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", + true, node->name); + return ORTE_ERR_SILENT; + } + /* get the number of objects of this type on this node */ + nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level, OPAL_HWLOC_AVAILABLE); + if (0 == nobjs) { + continue; + } + opal_output_verbose(2, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: found %u %s objects on node %s", + nobjs, hwloc_obj_type_string(target), node->name); + + /* if this is a comm_spawn situation, start with the object + * where the parent left off and increment */ + if (ORTE_JOBID_INVALID != jdata->originator.jobid) { + start = (jdata->bkmark_obj + 1) % nobjs; + } else { + start = 0; + } + /* loop over the procs on this node */ + for (j=0; j < node->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if
(proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr:assign skipping proc %s - from another job", + ORTE_NAME_PRINT(&proc->name)); + continue; + } + /* ignore procs from other apps */ + if (proc->app_idx != app->idx) { + continue; + } + opal_output_verbose(20, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assigning proc to object %d", (j + start) % nobjs); + /* get the hwloc object */ + if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, (j + start) % nobjs, OPAL_HWLOC_AVAILABLE))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + if (orte_rmaps_base.cpus_per_rank > (int)opal_hwloc_base_get_npus(node->topology->topo, obj)) { + orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low", true, + orte_rmaps_base.cpus_per_rank, opal_hwloc_base_get_npus(node->topology->topo, obj), + orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); + return ORTE_ERR_SILENT; + } + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + } + } + } + + return ORTE_SUCCESS; +} diff --git a/orte/mca/rmaps/seq/rmaps_seq.c b/orte/mca/rmaps/seq/rmaps_seq.c index 623a2184f59..9bbe2253964 100644 --- a/orte/mca/rmaps/seq/rmaps_seq.c +++ b/orte/mca/rmaps/seq/rmaps_seq.c @@ -54,7 +54,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata); /* define the module */ orte_rmaps_base_module_t orte_rmaps_seq_module = { - orte_rmaps_seq_map + .map_job = orte_rmaps_seq_map }; /* local object for tracking rank locations */ @@ -517,6 +517,10 @@ static int orte_rmaps_seq_map(orte_job_t *jdata) } } + /* mark that this job is to be fully + * described in the launch msg */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + return ORTE_SUCCESS; error: diff --git a/orte/mca/state/base/state_base_fns.c b/orte/mca/state/base/state_base_fns.c index 
69cfa8945a8..38c27ba08a2 100644 --- a/orte/mca/state/base/state_base_fns.c +++ b/orte/mca/state/base/state_base_fns.c @@ -899,8 +899,6 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata) opal_pointer_array_set_item(map->nodes, index, NULL); /* maintain accounting */ OBJ_RELEASE(node); - /* flag that the node is no longer in a map */ - ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } OBJ_RELEASE(map); jdata->map = NULL; diff --git a/orte/mca/state/dvm/state_dvm.c b/orte/mca/state/dvm/state_dvm.c index 6fcecd26bee..d095813594f 100644 --- a/orte/mca/state/dvm/state_dvm.c +++ b/orte/mca/state/dvm/state_dvm.c @@ -255,7 +255,7 @@ static void vm_ready(int fd, short args, void *cbdata) /* if we couldn't provide the allocation regex on the orted * cmd line, then we need to provide all the info here */ if (!orte_nidmap_communicated) { - if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&nidmap))) { + if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &nidmap))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return; diff --git a/orte/mca/state/hnp/state_hnp.c b/orte/mca/state/hnp/state_hnp.c index c18c4a0e01a..cfde6135390 100644 --- a/orte/mca/state/hnp/state_hnp.c +++ b/orte/mca/state/hnp/state_hnp.c @@ -73,6 +73,8 @@ static orte_job_state_t launch_states[] = { ORTE_JOB_STATE_DAEMONS_LAUNCHED, ORTE_JOB_STATE_DAEMONS_REPORTED, ORTE_JOB_STATE_VM_READY, + ORTE_JOB_STATE_MAP, + ORTE_JOB_STATE_MAP_COMPLETE, ORTE_JOB_STATE_SYSTEM_PREP, ORTE_JOB_STATE_LAUNCH_APPS, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE, @@ -91,6 +93,8 @@ static orte_state_cbfunc_t launch_callbacks[] = { orte_plm_base_daemons_launched, orte_plm_base_daemons_reported, orte_plm_base_vm_ready, + orte_rmaps_base_map_job, + orte_plm_base_mapping_complete, orte_plm_base_complete_setup, orte_plm_base_launch_apps, orte_state_base_local_launch_complete, diff --git a/orte/mca/state/novm/state_novm.c b/orte/mca/state/novm/state_novm.c index 512f6cc43dd..72d7c0bd397 100644 --- 
a/orte/mca/state/novm/state_novm.c +++ b/orte/mca/state/novm/state_novm.c @@ -61,6 +61,7 @@ orte_state_base_module_t orte_state_novm_module = { }; static void allocation_complete(int fd, short args, void *cbdata); +static void map_complete(int fd, short args, void *cbdata); static void vm_ready(int fd, short args, void *cbdata); /* defined state machine sequence for no VM - individual @@ -74,6 +75,8 @@ static orte_job_state_t launch_states[] = { ORTE_JOB_STATE_DAEMONS_LAUNCHED, ORTE_JOB_STATE_DAEMONS_REPORTED, ORTE_JOB_STATE_VM_READY, + ORTE_JOB_STATE_MAP, + ORTE_JOB_STATE_MAP_COMPLETE, ORTE_JOB_STATE_SYSTEM_PREP, ORTE_JOB_STATE_LAUNCH_APPS, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE, @@ -93,6 +96,8 @@ static orte_state_cbfunc_t launch_callbacks[] = { orte_plm_base_daemons_launched, orte_plm_base_daemons_reported, vm_ready, + orte_rmaps_base_map_job, + map_complete, orte_plm_base_complete_setup, orte_plm_base_launch_apps, orte_state_base_local_launch_complete, @@ -195,7 +200,7 @@ static void allocation_complete(int fd, short args, void *cbdata) orte_job_t *daemons; orte_topology_t *t; orte_node_t *node; - int i, rc; + int i; jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE; @@ -235,21 +240,27 @@ static void allocation_complete(int fd, short args, void *cbdata) } } - /* perform the map */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_job(jdata))) { - ORTE_ERROR_LOG(rc); - ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); - goto done; - } - - /* after we map, we are ready to launch the daemons */ - ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS); + /* move to the map stage */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); done: /* cleanup */ OBJ_RELEASE(state); } +/* after we map, we are ready to launch the daemons */ +static void map_complete(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = state->jdata; + + jdata->state = ORTE_JOB_STATE_MAP_COMPLETE; + /* mapping is done - move on to launching the daemons
*/ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS); + + /* cleanup */ + OBJ_RELEASE(state); +} static void vm_ready(int fd, short args, void *cbdata) { diff --git a/orte/orted/pmix/pmix_server_pub.c b/orte/orted/pmix/pmix_server_pub.c index 4dcb9cfb755..6b3e5bde785 100644 --- a/orte/orted/pmix/pmix_server_pub.c +++ b/orte/orted/pmix/pmix_server_pub.c @@ -99,6 +99,10 @@ int pmix_server_publish_fn(opal_process_name_t *proc, opal_pmix_persistence_t persist = OPAL_PMIX_PERSIST_APP; bool rset, pset; + opal_output_verbose(1, orte_pmix_server_globals.output, + "%s orted:pmix:server PUBLISH", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + /* create the caddy */ req = OBJ_NEW(pmix_server_req_t); (void)asprintf(&req->operation, "PUBLISH: %s:%d", __FILE__, __LINE__); @@ -259,6 +263,10 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys, /* pack the keys too */ for (i=0; i < nkeys; i++) { + opal_output_verbose(5, orte_pmix_server_globals.output, + "%s lookup data %s for proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), keys[i], + ORTE_NAME_PRINT(proc)); if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &keys[i], 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(req); diff --git a/orte/runtime/data_type_support/orte_dt_packing_fns.c b/orte/runtime/data_type_support/orte_dt_packing_fns.c index c5914169198..04e434645f6 100644 --- a/orte/runtime/data_type_support/orte_dt_packing_fns.c +++ b/orte/runtime/data_type_support/orte_dt_packing_fns.c @@ -89,6 +89,53 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, return rc; } + /* pack the attributes that need to be sent */ + count = 0; + OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) { + if (ORTE_ATTR_GLOBAL == kv->local) { + ++count; + } + } + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) { + if (ORTE_ATTR_GLOBAL == kv->local) { +
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&kv, 1, ORTE_ATTRIBUTE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } + /* check for job info attribute */ + cache = NULL; + if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, (void**)&cache, OPAL_PTR) && + NULL != cache) { + /* we need to pack these as well, but they are composed + * of opal_value_t's on a list. So first pack the number + * of list elements */ + count = opal_list_get_size(cache); + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* now pack each element on the list */ + OPAL_LIST_FOREACH(val, cache, opal_value_t) { + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&val, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } else { + /* pack a zero to indicate no job info is being passed */ + count = 0; + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + /* pack the personality */ count = opal_argv_count(jobs[i]->personality); if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &count, 1, OPAL_INT32))) { @@ -134,14 +181,18 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, return rc; } - if (orte_no_vm && 0 < jobs[i]->num_procs) { - for (j=0; j < jobs[i]->procs->size; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jobs[i]->procs, j))) { - continue; - } - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&proc, 1, ORTE_PROC))) { - ORTE_ERROR_LOG(rc); - return rc; + if (0 < jobs[i]->num_procs) { + /* check attributes to see if this job is to be fully + * described in the launch msg */ + if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + for (j=0; j < jobs[i]->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jobs[i]->procs, j))) { + continue; + } 
+ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&proc, 1, ORTE_PROC))) { + ORTE_ERROR_LOG(rc); + return rc; + } } } } @@ -198,53 +249,6 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, ORTE_ERROR_LOG(rc); return rc; } - - /* pack the attributes that need to be sent */ - count = 0; - OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) { - if (ORTE_ATTR_GLOBAL == kv->local) { - ++count; - } - } - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) { - if (ORTE_ATTR_GLOBAL == kv->local) { - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&kv, 1, ORTE_ATTRIBUTE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } - /* check for job info attribute */ - cache = NULL; - if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, (void**)&cache, OPAL_PTR) && - NULL != cache) { - /* we need to pack these as well, but they are composed - * of opal_value_t's on a list. 
So first pack the number - * of list elements */ - count = opal_list_get_size(cache); - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* now pack each element on the list */ - OPAL_LIST_FOREACH(val, cache, opal_value_t) { - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&val, 1, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } else { - /* pack a zero to indicate no job info is being passed */ - count = 0; - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } } return ORTE_SUCCESS; } @@ -594,7 +598,11 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src, ORTE_ERROR_LOG(rc); return rc; } - + /* pack the last mapper */ + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->last_mapper), 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } /* pack the policies */ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->mapping), 1, ORTE_MAPPING_POLICY))) { ORTE_ERROR_LOG(rc); diff --git a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c index 93df939c8fb..6e49c160520 100644 --- a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c +++ b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c @@ -95,6 +95,44 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, return rc; } + /* unpack the attributes */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, + &n, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + for (k=0; k < count; k++) { + n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &kv, + &n, ORTE_ATTRIBUTE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + kv->local = ORTE_ATTR_GLOBAL; // obviously not a local value + opal_list_append(&jobs[i]->attributes, &kv->super); + } + /* unpack any job info */ + n=1; + if 
(ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, + &n, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (0 < count){ + cache = OBJ_NEW(opal_list_t); + orte_set_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, ORTE_ATTR_LOCAL, (void*)cache, OPAL_PTR); + for (k=0; k < count; k++) { + n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &val, + &n, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + opal_list_append(cache, &val->super); + } + } + /* unpack the personality */ n=1; if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, &n, OPAL_INT32))) { @@ -147,16 +185,20 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, return rc; } - if (orte_no_vm && 0 < jobs[i]->num_procs) { - orte_proc_t *proc; - for (j=0; j < jobs[i]->num_procs; j++) { - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, - &proc, &n, ORTE_PROC))) { - ORTE_ERROR_LOG(rc); - return rc; + if (0 < jobs[i]->num_procs) { + /* check attributes to see if this job was fully + * described in the launch msg */ + if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + orte_proc_t *proc; + for (j=0; j < jobs[i]->num_procs; j++) { + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, + &proc, &n, ORTE_PROC))) { + ORTE_ERROR_LOG(rc); + return rc; + } + opal_pointer_array_add(jobs[i]->procs, proc); } - opal_pointer_array_add(jobs[i]->procs, proc); } } @@ -204,44 +246,6 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, ORTE_ERROR_LOG(rc); return rc; } - - /* unpack the attributes */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, - &n, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - for (k=0; k < count; k++) { - n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &kv, - &n, ORTE_ATTRIBUTE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - kv->local = ORTE_ATTR_GLOBAL; // obviously not a local value - 
opal_list_append(&jobs[i]->attributes, &kv->super); - } - /* unpack any job info */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, - &n, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (0 < count){ - cache = OBJ_NEW(opal_list_t); - orte_set_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, ORTE_ATTR_LOCAL, (void*)cache, OPAL_PTR); - for (k=0; k < count; k++) { - n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &val, - &n, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - opal_list_append(cache, &val->super); - } - } } return ORTE_SUCCESS; @@ -655,6 +659,14 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest, return rc; } + /* unpack the last mapper */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, + &(maps[i]->last_mapper), &n, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* unpack the policies */ n = 1; if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, diff --git a/orte/runtime/orte_data_server.c b/orte/runtime/orte_data_server.c index 807f13f5911..605b0acd077 100644 --- a/orte/runtime/orte_data_server.c +++ b/orte/runtime/orte_data_server.c @@ -12,7 +12,7 @@ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2016 Los Alamos National Security, LLC. * All rights reserved - * Copyright (c) 2015-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ @@ -111,6 +111,8 @@ OBJ_CLASS_INSTANCE(orte_data_req_t, static opal_pointer_array_t orte_data_server_store; static opal_list_t pending; static bool initialized = false; +static int orte_data_server_output = -1; +static int orte_data_server_verbosity = -1; int orte_data_server_init(void) { @@ -121,6 +123,19 @@ int orte_data_server_init(void) } initialized = true; + /* register a verbosity */ + orte_data_server_verbosity = -1; + (void) mca_base_var_register ("orte", "orte", "data", "server_verbose", + "Debug verbosity for ORTE data server", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, + &orte_data_server_verbosity); + if (0 <= orte_data_server_verbosity) { + orte_data_server_output = opal_output_open(NULL); + opal_output_set_verbosity(orte_data_server_output, + orte_data_server_verbosity); + } + OBJ_CONSTRUCT(&orte_data_server_store, opal_pointer_array_t); if (ORTE_SUCCESS != (rc = opal_pointer_array_init(&orte_data_server_store, 1, @@ -180,7 +195,7 @@ void orte_data_server(int status, orte_process_name_t* sender, orte_data_req_t *req, *rqnext; orte_jobid_t jobid = ORTE_JOBID_INVALID; - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server got message from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); @@ -218,7 +233,7 @@ void orte_data_server(int status, orte_process_name_t* sender, goto SEND_ERROR; } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: publishing data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&data->owner))); @@ -245,7 +260,7 @@ void orte_data_server(int status, orte_process_name_t* sender, data->uid = iptr->data.uint32; OBJ_RELEASE(iptr); } else { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, "%s data server: adding %s to data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, 
ORTE_NAME_PRINT(&data->owner))); @@ -255,7 +270,7 @@ void orte_data_server(int status, orte_process_name_t* sender, data->index = opal_pointer_array_add(&orte_data_server_store, data); - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: checking for pending requests", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -276,7 +291,14 @@ void orte_data_server(int status, orte_process_name_t* sender, for (i=0; NULL != req->keys[i]; i++) { /* cycle thru the data keys for matches */ OPAL_LIST_FOREACH(iptr, &data->values, opal_value_t) { + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, + "%s\tCHECKING %s TO %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + iptr->key, req->keys[i])); if (0 == strcmp(iptr->key, req->keys[i])) { + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, + "%s data server: packaging return", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* found it - package it for return */ if (NULL == reply) { reply = OBJ_NEW(opal_buffer_t); @@ -296,7 +318,7 @@ void orte_data_server(int status, orte_process_name_t* sender, ORTE_ERROR_LOG(rc); break; } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, "%s data server: adding %s data from %s to response", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, ORTE_NAME_PRINT(&data->owner))); @@ -309,7 +331,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } if (NULL != reply) { /* send it back to the requestor */ - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: returning data to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&req->requestor))); @@ -326,11 +348,11 @@ void orte_data_server(int status, orte_process_name_t* sender, reply = NULL; /* if the persistence is "first_read", then delete this data */ if (OPAL_PMIX_PERSIST_FIRST_READ == data->persistence) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, 
orte_data_server_output, "%s NOT STORING DATA FROM %s AT INDEX %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&data->owner), data->index)); - opal_pointer_array_set_item(&orte_data_server_store, data->index, NULL); + ORTE_NAME_PRINT(&data->owner), data->index)); + opal_pointer_array_set_item(&orte_data_server_store, data->index, NULL); OBJ_RELEASE(data); goto release; } @@ -349,7 +371,7 @@ void orte_data_server(int status, orte_process_name_t* sender, break; case ORTE_PMIX_LOOKUP_CMD: - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: lookup data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); @@ -416,7 +438,7 @@ void orte_data_server(int status, orte_process_name_t* sender, /* cycle across the provided keys */ ret_packed = false; for (i=0; NULL != keys[i]; i++) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, "%s data server: looking for %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), keys[i])); /* cycle across the stored data, looking for a match */ @@ -428,6 +450,10 @@ void orte_data_server(int status, orte_process_name_t* sender, } /* for security reasons, can only access data posted by the same user id */ if (uid != data->uid) { + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, + "%s\tMISMATCH UID %u %u", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (unsigned)uid, (unsigned)data->uid)); continue; } /* if the published range is constrained to namespace, then only @@ -435,12 +461,17 @@ void orte_data_server(int status, orte_process_name_t* sender, * in the same namespace as the requestor */ if (OPAL_PMIX_RANGE_NAMESPACE == data->range) { if (jobid != data->owner.jobid) { + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, + "%s\tMISMATCH JOBID %s %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jobid), + ORTE_JOBID_PRINT(data->owner.jobid))); continue; } } /* see if we have this key */ OPAL_LIST_FOREACH(iptr, 
&data->values, opal_value_t) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, "%s COMPARING %s %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), keys[i], iptr->key)); @@ -461,7 +492,7 @@ void orte_data_server(int status, orte_process_name_t* sender, opal_argv_free(keys); goto SEND_ERROR; } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: adding %s to data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, ORTE_NAME_PRINT(&data->owner))); @@ -473,7 +504,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } } if (data_added && OPAL_PMIX_PERSIST_FIRST_READ == data->persistence) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s REMOVING DATA FROM %s AT INDEX %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&data->owner), data->index)); @@ -483,14 +514,14 @@ void orte_data_server(int status, orte_process_name_t* sender, } } if (!ret_packed) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server:lookup: data not found", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* if we were told to wait for the data, then queue this up * for later processing */ if (wait) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server:lookup: pushing request to wait", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); OBJ_RELEASE(answer); @@ -510,7 +541,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } opal_argv_free(keys); - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server:lookup: data found", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto SEND_ANSWER; @@ -524,7 +555,7 @@ void orte_data_server(int status, orte_process_name_t* sender, goto SEND_ERROR; } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, 
orte_data_server_output, "%s data server: unpublish data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&requestor))); @@ -629,7 +660,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } SEND_ERROR: - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: sending error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); @@ -646,5 +677,3 @@ void orte_data_server(int status, orte_process_name_t* sender, OBJ_RELEASE(answer); } } - - diff --git a/orte/test/mpi/Makefile b/orte/test/mpi/Makefile index 3a0074aa325..eb08bbc5fdf 100644 --- a/orte/test/mpi/Makefile +++ b/orte/test/mpi/Makefile @@ -1,4 +1,11 @@ -PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach +PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn \ + concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child \ + bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help \ + crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop \ + parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort \ + debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info \ + info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach xlib \ + no-disconnect all: $(PROGS) diff --git a/orte/test/mpi/no-disconnect.c b/orte/test/mpi/no-disconnect.c new file mode 100644 index 
00000000000..9403b3ff345 --- /dev/null +++ b/orte/test/mpi/no-disconnect.c @@ -0,0 +1,210 @@ +/* Contributed by Marcia Cristina Cera + , + http://www.open-mpi.org/community/lists/users/2009/12/11540.php */ + +/* It was decided that the issue highlighted by this test will NOT be + fixed in the 1.3/1.4 series. It is already fixed in the 1.5 + series. Hence, if we detect Open MPI < v1.5, return 77/skip. */ +/* Turns out the hnp cannot handle concurrent MPI_Comm_spawns + as of Open MPI 1.7. However, we hope this feature will + work in 2.0. with the new state machine based orte. */ + +#include +#include +#include +#include +#include +#include + +#include + +#define NCHARS 30 +const int max_depth = 4; + +/* + * Here are some replacements for standard, blocking MPI + * functions. These replacements are "nice" and yield the + * CPU instead of spinning hard. The interfaces are the same. + * Just replace: + * MPI_Recv with nice_recv + * MPI_Send with nice_send + * MPI_Barrier with nice_barrier + */ + + +static int nice_send(void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm) { + /* Assume a standard (presumably short/eager) send suffices. */ + return MPI_Send(buf, count, datatype, dest, tag, comm); +} + + +static int nice_recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status) { + MPI_Request req; + int flag; + struct timespec dt; + + /* + * We're only interested in modest levels of oversubscription + * -- e.g., 2-4x more processes than physical processors. + * So, the sleep time only needs to be about 2-4x longer than + * a futile MPI_Test call. For a wide range of processors, + * something less than a millisecond should be sufficient. + * Excessive sleep times (e.g., 1 second) would degrade performance. + */ + dt.tv_sec = 0; + dt.tv_nsec = 100000; + + MPI_Irecv(buf, count, datatype, source, tag, comm, &req); + + MPI_Test(&req, &flag, status); + while ( ! 
flag ) { + nanosleep(&dt, NULL); + MPI_Test(&req, &flag, status); + } + return MPI_SUCCESS; +} + + +static void nice_barrier(MPI_Comm comm) { + int me, np, jump, buf = -1; + + MPI_Comm_rank(comm,&me); + MPI_Comm_size(comm,&np); + + /* fan in */ + for ( jump = 1; jump < np; jump <<= 1 ) { + if ( ( me & jump ) != 0 ) { + nice_send(&buf, 1, MPI_INT, me - jump, 343, comm); + break; + } else if ( me + jump < np ) { + nice_recv(&buf, 1, MPI_INT, me + jump, 343, comm, MPI_STATUS_IGNORE); + } + } + + /* fan out */ + if ( 0 != me ) { + nice_recv(&buf, 1, MPI_INT, me - jump, 344, comm, MPI_STATUS_IGNORE); + } + jump >>= 1; + for ( ; jump > 0; jump >>= 1 ) { + if ( me + jump < np ) { + nice_send(&buf, 1, MPI_INT, me + jump, 344, comm); + } + } +} + + +int main (int argc, char **argv) +{ + char bufs [NCHARS]; /* send buffer */ + char bufr[2][NCHARS]; /* recv buffers */ + MPI_Comm parent; + int level = 0, participate = 1; + struct utsname buf; + + /* If this is prior to OMPI v2.0, return 77/skip */ +#if defined(OPEN_MPI) + if (OMPI_MAJOR_VERSION < 2) { + printf("Skipping, because the orte cannot handle concurrent MPI_Comm_spawns\n"); + return 77; + } else { + printf("Verify that this test is truly working because conncurrent MPI_Comm_spawns" + " has not worked before.\n"); + } +#endif + + uname(&buf); + printf("I AM pid %d with level %d on %s\n", getpid(), (argc < 2)?0:atoi(argv[1]), buf.nodename); + + MPI_Init(&argc, &argv); + MPI_Comm_get_parent(&parent); + + if (MPI_COMM_NULL != parent) { + /* spawned processes get stuff from parent */ + level = atoi(argv[1]); + MPI_Recv(&bufr[0], sizeof(char)*NCHARS, MPI_CHAR, MPI_ANY_SOURCE, + MPI_ANY_TAG, parent, MPI_STATUS_IGNORE); + printf("Parent sent: %s\n", bufr[0]); + } else { + + /* original processes have to decide whether to participate */ + + /* In this test, each process launched by "mpirun -n " spawns a + * binary tree of processes. You end up with * ( 1 << max_depth ) + * processes altogether. For max_depth=4, this means 16*. 
There + * is potential here for heavy oversubscription, especially if in + * testing we launch tests with set to the number of available + * processors. This test tolerates oversubscription somewhat since + * it entails little inter-process synchronization. Nevertheless, + * we try to idle all but /4 of the original processes, using a + * minimum of at least two processes + */ + + int me, np; + + MPI_Comm_size(MPI_COMM_WORLD,&np); + MPI_Comm_rank(MPI_COMM_WORLD,&me); + + if ( np > 4 ) { + /* turn off all but every 4th process */ + if ( ( me & 3 ) != 0 ) participate = 0; + } else + if ( np > 2 ) { + /* turn off all but every 2nd process */ + if ( ( me & 1 ) != 0 ) participate = 0; + } + } + + /* all spawned processes and selected "root" processes participate */ + if ( participate ) { + printf("level = %d\n", level); + + /* prepare send buffer */ + sprintf(bufs,"level %d (pid:%d)", level, getpid()); + + /* spawn */ + if (level < max_depth) { + int i, nspawn = 2, errcodes[1]; + MPI_Request req[2]; + MPI_Comm comm[2]; + char argv1[NCHARS]; + char *args[2]; + + /* level 0 spawns only one process to mimic the original test */ + if ( level == 0 ) nspawn = 1; + + /* prepare command line arguments */ + snprintf(argv1, sizeof(argv1), "%d", level+1); + args[0] = argv1; + args[1] = NULL; + + /* spawn, with a message sent to and received from each child */ + for ( i = 0; i < nspawn; i++ ) { + MPI_Comm_spawn(argv[0], args, 1, MPI_INFO_NULL, 0, MPI_COMM_SELF, + &comm[i], errcodes); + MPI_Send(&bufs, sizeof(char)*NCHARS, MPI_CHAR, 0, 100, comm[i]); + MPI_Irecv(&bufr[i], sizeof(char)*NCHARS, MPI_CHAR, MPI_ANY_SOURCE, + MPI_ANY_TAG, comm[i], &req[i]); + } + + /* wait for messages from children and print them */ + MPI_Waitall(nspawn, req, MPI_STATUSES_IGNORE); + for ( i = 0; i < nspawn; i++ ) + printf("Child %d sent: %s\n", i, bufr[i]); + } + + /* send message back to parent */ + if (MPI_COMM_NULL != parent) { + MPI_Send(&bufs, sizeof(char)*NCHARS, MPI_CHAR, 0, 100, parent); + } + } 
+ + /* non-participating processes wait at this barrier for their peers */ + /* (This barrier won't cost that many CPU cycles.) */ + if (MPI_COMM_NULL == parent) { + nice_barrier(MPI_COMM_WORLD); + } + + MPI_Finalize(); + return 0; +} diff --git a/orte/util/attr.c b/orte/util/attr.c index 1f447f4a87c..a2d6ed48a7d 100644 --- a/orte/util/attr.c +++ b/orte/util/attr.c @@ -286,6 +286,8 @@ const char *orte_attr_key_to_str(orte_attribute_key_t key) return "ORTE_JOB_TRANSPORT_KEY"; case ORTE_JOB_INFO_CACHE: return "ORTE_JOB_INFO_CACHE"; + case ORTE_JOB_FULLY_DESCRIBED: + return "ORTE_JOB_FULLY_DESCRIBED"; case ORTE_PROC_NOBARRIER: return "PROC-NOBARRIER"; diff --git a/orte/util/attr.h b/orte/util/attr.h index 1b961030091..817581e38b6 100644 --- a/orte/util/attr.h +++ b/orte/util/attr.h @@ -143,6 +143,7 @@ typedef uint16_t orte_job_flags_t; #define ORTE_JOB_NOTIFY_COMPLETION (ORTE_JOB_START_KEY + 50) // bool - notify parent proc when spawned job terminates #define ORTE_JOB_TRANSPORT_KEY (ORTE_JOB_START_KEY + 51) // string - transport keys assigned to this job #define ORTE_JOB_INFO_CACHE (ORTE_JOB_START_KEY + 52) // opal_list_t - list of opal_value_t to be included in job_info +#define ORTE_JOB_FULLY_DESCRIBED (ORTE_JOB_START_KEY + 53) // bool - job is fully described in launch msg #define ORTE_JOB_MAX_KEY 300 diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index 3b2ec9bdfeb..ca4948fcbca 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -198,7 +198,7 @@ int orte_util_build_daemon_nidmap(void) return rc; } -int orte_util_nidmap_create(char **regex) +int orte_util_nidmap_create(opal_pointer_array_t *pool, char **regex) { char *node; char prefix[ORTE_MAX_NODE_PREFIX]; @@ -217,8 +217,8 @@ int orte_util_nidmap_create(char **regex) OBJ_CONSTRUCT(&dvpids, opal_list_t); rng = NULL; - for (n=0; n < orte_node_pool->size; n++) { - if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { + for (n=0; n < pool->size; n++) { + if (NULL == (nptr = 
(orte_node_t*)opal_pointer_array_get_item(pool, n))) { continue; } /* if no daemon has been assigned, then this node is not being used */ @@ -1180,3 +1180,217 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) OPAL_LIST_DESTRUCT(&flgs); return rc; } + +typedef struct { + opal_list_item_t super; + int ctx; + int nprocs; + int cnt; +} orte_nidmap_regex_t; +static void nrcon(orte_nidmap_regex_t *p) +{ + p->ctx = 0; + p->nprocs = -1; + p->cnt = 0; +} +static OBJ_CLASS_INSTANCE(orte_nidmap_regex_t, + opal_list_item_t, + nrcon, NULL); + +/* since not every node is involved in a job, we have to create a + * regex that indicates the ppn for every node, marking those that + * are not involved. Since each daemon knows the entire + * node pool, we simply provide a ppn for every daemon, with a -1 + * to indicate that the node is empty for that job */ +int orte_util_nidmap_generate_ppn(orte_job_t *jdata, char **ppn) +{ + orte_nidmap_regex_t *prng, **actives; + opal_list_t *prk; + orte_node_t *nptr; + orte_proc_t *proc; + size_t n; + int *cnt, i, k; + char *tmp2, *ptmp, **cache = NULL; + + /* create an array of lists to handle the number of app_contexts in this job */ + prk = (opal_list_t*)malloc(jdata->num_apps * sizeof(opal_list_t)); + cnt = (int*)malloc(jdata->num_apps * sizeof(int)); + actives = (orte_nidmap_regex_t**)malloc(jdata->num_apps * sizeof(orte_nidmap_regex_t*)); + for (n=0; n < jdata->num_apps; n++) { + OBJ_CONSTRUCT(&prk[n], opal_list_t); + actives[n] = NULL; + } + + /* we provide a complete map in the regex, with an entry for every + * node in the pool */ + for (i=0; i < orte_node_pool->size; i++) { + if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + /* if a daemon has been assigned, then count how many procs + * for each app_context from the specified job are assigned to this node */ + memset(cnt, 0, jdata->num_apps * sizeof(int)); + if (NULL != nptr->daemon) { + for (k=0; k < nptr->procs->size; 
k++) { + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(nptr->procs, k))) { + if (proc->name.jobid == jdata->jobid) { + ++cnt[proc->app_idx]; + } + } + } + } + /* track the #procs on this node */ + for (n=0; n < jdata->num_apps; n++) { + if (NULL == actives[n]) { + /* just starting */ + actives[n] = OBJ_NEW(orte_nidmap_regex_t); + actives[n]->nprocs = cnt[n]; + actives[n]->cnt = 1; + opal_list_append(&prk[n], &actives[n]->super); + } else { + /* is this the next in line */ + if (cnt[n] == actives[n]->nprocs) { + actives[n]->cnt++; + } else { + /* need to start another range */ + actives[n] = OBJ_NEW(orte_nidmap_regex_t); + actives[n]->nprocs = cnt[n]; + actives[n]->cnt = 1; + opal_list_append(&prk[n], &actives[n]->super); + } + } + } + } + + /* construct the regex from the found ranges for each app_context */ + ptmp = NULL; + for (n=0; n < jdata->num_apps; n++) { + OPAL_LIST_FOREACH(prng, &prk[n], orte_nidmap_regex_t) { + if (1 < prng->cnt) { + if (NULL == ptmp) { + asprintf(&ptmp, "%u(%u)", prng->nprocs, prng->cnt); + } else { + asprintf(&tmp2, "%s,%u(%u)", ptmp, prng->nprocs, prng->cnt); + free(ptmp); + ptmp = tmp2; + } + } else { + if (NULL == ptmp) { + asprintf(&ptmp, "%u", prng->nprocs); + } else { + asprintf(&tmp2, "%s,%u", ptmp, prng->nprocs); + free(ptmp); + ptmp = tmp2; + } + } + } + OPAL_LIST_DESTRUCT(&prk[n]); // releases all the actives objects + opal_argv_append_nosize(&cache, ptmp); + free(ptmp); + ptmp = NULL; + } + free(prk); + free(cnt); + free(actives); + + *ppn = opal_argv_join(cache, '@'); + opal_argv_free(cache); + + return ORTE_SUCCESS; +} + +int orte_util_nidmap_parse_ppn(orte_job_t *jdata, char *regex) +{ + orte_node_t *node; + orte_proc_t *proc; + int n, k, m, cnt; + char **tmp, *ptr, **ppn; + orte_nidmap_regex_t *rng; + opal_list_t trk; + int rc = ORTE_SUCCESS; + + /* split the regex by app_context */ + tmp = opal_argv_split(regex, '@'); + + /* for each app_context, set the ppn */ + for (n=0; NULL != tmp[n]; n++) { + ppn = 
opal_argv_split(tmp[n], ','); + /* decompress the ppn */ + OBJ_CONSTRUCT(&trk, opal_list_t); + for (m=0; NULL != ppn[m]; m++) { + rng = OBJ_NEW(orte_nidmap_regex_t); + opal_list_append(&trk, &rng->super); + /* check for a count */ + if (NULL != (ptr = strchr(ppn[m], '('))) { + ppn[m][strlen(ppn[m])-1] = '\0'; // remove trailing paren + *ptr = '\0'; + ++ptr; + rng->cnt = strtoul(ptr, NULL, 10); + } else { + rng->cnt = 1; + } + /* convert the number */ + rng->nprocs = strtoul(ppn[m], NULL, 10); + } + opal_argv_free(ppn); + + /* cycle thru our node pool and add the indicated number of procs + * to each node */ + rng = (orte_nidmap_regex_t*)opal_list_get_first(&trk); + cnt = 0; + for (m=0; m < orte_node_pool->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, m))) { + continue; + } + /* see if it has any procs for this job and app_context */ + if (0 < rng->nprocs) { + /* add this node to the job map if it isn't already there */ + if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { + OBJ_RETAIN(node); + ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); + opal_pointer_array_add(jdata->map->nodes, node); + } + /* create a proc object for each one */ + for (k=0; k < rng->nprocs; k++) { + proc = OBJ_NEW(orte_proc_t); + proc->name.jobid = jdata->jobid; + /* leave the vpid undefined as this will be determined + * later when we do the overall ranking */ + proc->app_idx = n; + proc->parent = node->daemon->name.vpid; + OBJ_RETAIN(node); + proc->node = node; + /* flag the proc as ready for launch */ + proc->state = ORTE_PROC_STATE_INIT; + opal_pointer_array_add(node->procs, proc); + /* we will add the proc to the jdata array when we + * compute its rank */ + } + node->num_procs += rng->nprocs; + } + ++cnt; + if (rng->cnt <= cnt) { + rng = (orte_nidmap_regex_t*)opal_list_get_next(&rng->super); + if (NULL == rng) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + opal_argv_free(tmp); + rc = ORTE_ERR_NOT_FOUND; + goto complete; + } + cnt = 0; + } + } + 
OPAL_LIST_DESTRUCT(&trk); + } + opal_argv_free(tmp); + + complete: + /* reset any node map flags we used so the next job will start clean */ + for (n=0; n < jdata->map->nodes->size; n++) { + if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) { + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); + } + } + + return rc; +} diff --git a/orte/util/nidmap.h b/orte/util/nidmap.h index 3acc29b9277..e8c6f59bc21 100644 --- a/orte/util/nidmap.h +++ b/orte/util/nidmap.h @@ -46,7 +46,7 @@ BEGIN_C_DECLS ORTE_DECLSPEC void orte_util_nidmap_init(void); -ORTE_DECLSPEC int orte_util_nidmap_create(char **regex); +ORTE_DECLSPEC int orte_util_nidmap_create(opal_pointer_array_t *pool, char **regex); ORTE_DECLSPEC int orte_util_nidmap_parse(char *regex); /* create a regular expression describing the nodes in the @@ -59,6 +59,12 @@ ORTE_DECLSPEC int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer); ORTE_DECLSPEC int orte_util_build_daemon_nidmap(void); +/* create a regular expression describing the ppn for a job */ +ORTE_DECLSPEC int orte_util_nidmap_generate_ppn(orte_job_t *jdata, char **ppn); + +/* decode the ppn */ +ORTE_DECLSPEC int orte_util_nidmap_parse_ppn(orte_job_t *jdata, char *ppn); + END_C_DECLS #endif From 372ae3441c26c485670cd7ad59de32e68490c889 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 26 May 2017 08:57:55 -0700 Subject: [PATCH 16/29] Fix ompi-server operations Signed-off-by: Ralph Castain (cherry picked from commit 8c2a06477c85ae617c256f71ae40c9f7340a7ee6) --- orte/mca/rml/base/rml_base_contact.c | 16 ++- orte/orted/help-orted.txt | 7 ++ orte/orted/pmix/pmix_server.c | 91 +------------- orte/orted/pmix/pmix_server_internal.h | 4 +- orte/orted/pmix/pmix_server_pub.c | 157 ++++++++++++++++++++----- 5 files changed, 150 insertions(+), 125 deletions(-) diff --git a/orte/mca/rml/base/rml_base_contact.c b/orte/mca/rml/base/rml_base_contact.c index 6bc41fe2283..6ee2f2c2c8b 100644 --- a/orte/mca/rml/base/rml_base_contact.c +++ 
b/orte/mca/rml/base/rml_base_contact.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -72,6 +72,7 @@ int orte_rml_base_get_contact_info(orte_jobid_t job, opal_buffer_t *data) int orte_rml_base_update_contact_info(opal_buffer_t* data) { orte_std_cntr_t cnt; + orte_process_name_t peer; orte_vpid_t num_procs; char *rml_uri; int rc; @@ -89,11 +90,18 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data) if (NULL != rml_uri) { /* set the contact info into the hash table */ orte_rml.set_contact_info(rml_uri); + /* if this was an update to my own job, then + * track how many procs were in the message */ + if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &peer, NULL))) { + ORTE_ERROR_LOG(rc); + free(rml_uri); + return rc; + } + if (peer.jobid == ORTE_PROC_MY_NAME->jobid) { + ++num_procs; + } free(rml_uri); } - - /* track how many procs were in the message */ - ++num_procs; } if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { ORTE_ERROR_LOG(rc); diff --git a/orte/orted/help-orted.txt b/orte/orted/help-orted.txt index c89d4e10157..cec46c2d159 100644 --- a/orte/orted/help-orted.txt +++ b/orte/orted/help-orted.txt @@ -80,3 +80,10 @@ This is usually caused by a large job that encounters significant delays across the cluster when starting the application processes. Your job may terminate as a result of this problem. You may want to adjust the MCA parameter pmix_server_max_reqs and try again. 
+# +[noserver] +A publish/lookup server was provided, but we were unable to connect +to it - please check the connection info and ensure the server +is alive: + + Connection: %s diff --git a/orte/orted/pmix/pmix_server.c b/orte/orted/pmix/pmix_server.c index 63b4dbfdd39..0ed02ce6b74 100644 --- a/orte/orted/pmix/pmix_server.c +++ b/orte/orted/pmix/pmix_server.c @@ -296,94 +296,6 @@ int pmix_server_init(void) } OPAL_LIST_DESTRUCT(&info); - /* if the universal server wasn't specified, then we use - * our own HNP for that purpose */ - if (NULL == orte_pmix_server_globals.server_uri) { - orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP; - } else { - char *server; - opal_buffer_t buf; - if (0 == strncmp(orte_pmix_server_globals.server_uri, "file", strlen("file")) || - 0 == strncmp(orte_pmix_server_globals.server_uri, "FILE", strlen("FILE"))) { - char input[1024], *filename; - FILE *fp; - - /* it is a file - get the filename */ - filename = strchr(orte_pmix_server_globals.server_uri, ':'); - if (NULL == filename) { - /* filename is not correctly formatted */ - orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, - orte_basename, orte_pmix_server_globals.server_uri); - return ORTE_ERR_BAD_PARAM; - } - ++filename; /* space past the : */ - - if (0 >= strlen(filename)) { - /* they forgot to give us the name! */ - orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true, - orte_basename, orte_pmix_server_globals.server_uri); - return ORTE_ERR_BAD_PARAM; - } - - /* open the file and extract the uri */ - fp = fopen(filename, "r"); - if (NULL == fp) { /* can't find or read file! 
*/ - orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true, - orte_basename, orte_pmix_server_globals.server_uri); - return ORTE_ERR_BAD_PARAM; - } - if (NULL == fgets(input, 1024, fp)) { - /* something malformed about file */ - fclose(fp); - orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true, - orte_basename, orte_pmix_server_globals.server_uri, - orte_basename); - return ORTE_ERR_BAD_PARAM; - } - fclose(fp); - input[strlen(input)-1] = '\0'; /* remove newline */ - server = strdup(input); - } else { - server = strdup(orte_pmix_server_globals.server_uri); - } - /* setup our route to the server */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.pack(&buf, &server, 1, OPAL_STRING); - if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) { - ORTE_ERROR_LOG(rc); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - return rc; - } - OBJ_DESTRUCT(&buf); - /* parse the URI to get the server's name */ - if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(server, &orte_pmix_server_globals.server, NULL))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* check if we are to wait for the server to start - resolves - * a race condition that can occur when the server is run - * as a background job - e.g., in scripts - */ - if (orte_pmix_server_globals.wait_for_server) { - /* ping the server */ - struct timeval timeout; - timeout.tv_sec = orte_pmix_server_globals.timeout; - timeout.tv_usec = 0; - if (ORTE_SUCCESS != (rc = orte_rml.ping(orte_mgmt_conduit, server, &timeout))) { - /* try it one more time */ - if (ORTE_SUCCESS != (rc = orte_rml.ping(orte_mgmt_conduit, server, &timeout))) { - /* okay give up */ - orte_show_help("help-orterun.txt", "orterun:server-not-found", true, - orte_basename, server, - (long)orte_pmix_server_globals.timeout, - ORTE_ERROR_NAME(rc)); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - return rc; - } - } - } - } - return rc; } @@ -716,8 +628,9 @@ 
OBJ_CLASS_INSTANCE(orte_pmix_server_op_caddy_t, static void rqcon(pmix_server_req_t *p) { p->operation = NULL; - p->target = *ORTE_NAME_INVALID; + p->range = OPAL_PMIX_RANGE_SESSION; p->proxy = *ORTE_NAME_INVALID; + p->target = *ORTE_NAME_INVALID; p->timeout = orte_pmix_server_globals.timeout; p->jdata = NULL; OBJ_CONSTRUCT(&p->msg, opal_buffer_t); diff --git a/orte/orted/pmix/pmix_server_internal.h b/orte/orted/pmix/pmix_server_internal.h index 5712529b5c7..52460271884 100644 --- a/orte/orted/pmix/pmix_server_internal.h +++ b/orte/orted/pmix/pmix_server_internal.h @@ -67,8 +67,9 @@ int timeout; int room_num; int remote_room_num; + opal_pmix_data_range_t range; orte_process_name_t proxy; - opal_process_name_t target; + orte_process_name_t target; orte_job_t *jdata; opal_buffer_t msg; opal_pmix_op_cbfunc_t opcbfunc; @@ -255,6 +256,7 @@ typedef struct { bool wait_for_server; orte_process_name_t server; opal_list_t notifications; + bool pubsub_init; } pmix_server_globals_t; extern pmix_server_globals_t orte_pmix_server_globals; diff --git a/orte/orted/pmix/pmix_server_pub.c b/orte/orted/pmix/pmix_server_pub.c index 6b3e5bde785..0f009d1a9f1 100644 --- a/orte/orted/pmix/pmix_server_pub.c +++ b/orte/orted/pmix/pmix_server_pub.c @@ -42,14 +42,126 @@ #include "orte/runtime/orte_data_server.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/rml/rml.h" +#include "orte/mca/rml/base/rml_contact.h" #include "pmix_server_internal.h" +static int init_server(void) +{ + char *server; + opal_buffer_t buf; + char input[1024], *filename; + FILE *fp; + int rc; + + /* only do this once */ + orte_pmix_server_globals.pubsub_init = true; + + /* if the universal server wasn't specified, then we use + * our own HNP for that purpose */ + if (NULL == orte_pmix_server_globals.server_uri) { + orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP; + } else { + if (0 == strncmp(orte_pmix_server_globals.server_uri, "file", strlen("file")) || + 0 == 
strncmp(orte_pmix_server_globals.server_uri, "FILE", strlen("FILE"))) { + /* it is a file - get the filename */ + filename = strchr(orte_pmix_server_globals.server_uri, ':'); + if (NULL == filename) { + /* filename is not correctly formatted */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, + orte_basename, orte_pmix_server_globals.server_uri); + return ORTE_ERR_BAD_PARAM; + } + ++filename; /* space past the : */ + + if (0 >= strlen(filename)) { + /* they forgot to give us the name! */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true, + orte_basename, orte_pmix_server_globals.server_uri); + return ORTE_ERR_BAD_PARAM; + } + + /* open the file and extract the uri */ + fp = fopen(filename, "r"); + if (NULL == fp) { /* can't find or read file! */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true, + orte_basename, orte_pmix_server_globals.server_uri); + return ORTE_ERR_BAD_PARAM; + } + if (NULL == fgets(input, 1024, fp)) { + /* something malformed about file */ + fclose(fp); + orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true, + orte_basename, orte_pmix_server_globals.server_uri, + orte_basename); + return ORTE_ERR_BAD_PARAM; + } + fclose(fp); + input[strlen(input)-1] = '\0'; /* remove newline */ + server = strdup(input); + } else { + server = strdup(orte_pmix_server_globals.server_uri); + } + /* setup our route to the server */ + OBJ_CONSTRUCT(&buf, opal_buffer_t); + opal_dss.pack(&buf, &server, 1, OPAL_STRING); + if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) { + ORTE_ERROR_LOG(rc); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + return rc; + } + OBJ_DESTRUCT(&buf); + /* parse the URI to get the server's name */ + if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(server, &orte_pmix_server_globals.server, NULL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* check if we are to wait for the server to start - resolves + * 
a race condition that can occur when the server is run + * as a background job - e.g., in scripts + */ + if (orte_pmix_server_globals.wait_for_server) { + opal_output(0, "WAIT"); + /* ping the server */ + struct timeval timeout; + timeout.tv_sec = orte_pmix_server_globals.timeout; + timeout.tv_usec = 0; + if (ORTE_SUCCESS != (rc = orte_rml.ping(orte_mgmt_conduit, server, &timeout))) { + /* try it one more time */ + if (ORTE_SUCCESS != (rc = orte_rml.ping(orte_mgmt_conduit, server, &timeout))) { + /* okay give up */ + orte_show_help("help-orterun.txt", "orterun:server-not-found", true, + orte_basename, server, + (long)orte_pmix_server_globals.timeout, + ORTE_ERROR_NAME(rc)); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + return rc; + } + } + } + } + + opal_output(0, "SERVER READY"); + + return ORTE_SUCCESS; +} + static void execute(int sd, short args, void *cbdata) { pmix_server_req_t *req = (pmix_server_req_t*)cbdata; int rc; opal_buffer_t *xfer; + orte_process_name_t *target; + + if (!orte_pmix_server_globals.pubsub_init) { + /* we need to initialize our connection to the server */ + if (ORTE_SUCCESS != (rc = init_server())) { + orte_show_help("help-orted.txt", "noserver", true, + (NULL == orte_pmix_server_globals.server_uri) ? 
+ "NULL" : orte_pmix_server_globals.server_uri); + goto callback; + } + } /* add this request to our tracker hotel */ if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) { @@ -67,9 +179,16 @@ static void execute(int sd, short args, void *cbdata) } opal_dss.copy_payload(xfer, &req->msg); + /* if the range is SESSION, then set the target to the global server */ + if (OPAL_PMIX_RANGE_SESSION == req->range) { + target = &orte_pmix_server_globals.server; + } else { + target = ORTE_PROC_MY_HNP; + } + /* send the request to the target */ rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, - &req->target, xfer, + target, xfer, ORTE_RML_TAG_DATA_SERVER, orte_rml_send_callback, NULL); if (ORTE_SUCCESS == rc) { @@ -95,7 +214,6 @@ int pmix_server_publish_fn(opal_process_name_t *proc, int rc; uint8_t cmd = ORTE_PMIX_PUBLISH_CMD; opal_value_t *iptr; - opal_pmix_data_range_t range = OPAL_PMIX_RANGE_SESSION; opal_pmix_persistence_t persist = OPAL_PMIX_PERSIST_APP; bool rset, pset; @@ -128,7 +246,7 @@ int pmix_server_publish_fn(opal_process_name_t *proc, pset = false; OPAL_LIST_FOREACH(iptr, info, opal_value_t) { if (0 == strcmp(iptr->key, OPAL_PMIX_RANGE)) { - range = (opal_pmix_data_range_t)iptr->data.uint; + req->range = (opal_pmix_data_range_t)iptr->data.uint; if (pset) { break; } @@ -143,19 +261,12 @@ int pmix_server_publish_fn(opal_process_name_t *proc, } /* pack the range */ - if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &range, 1, OPAL_PMIX_DATA_RANGE))) { + if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &req->range, 1, OPAL_PMIX_DATA_RANGE))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(req); return rc; } - /* if the range is SESSION, then set the target to the global server */ - if (OPAL_PMIX_RANGE_SESSION == range) { - req->target = orte_pmix_server_globals.server; - } else { - req->target = *ORTE_PROC_MY_HNP; - } - /* pack the persistence */ if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &persist, 1, OPAL_INT))) { 
ORTE_ERROR_LOG(rc); @@ -205,7 +316,6 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys, uint8_t cmd = ORTE_PMIX_LOOKUP_CMD; int32_t nkeys, i; opal_value_t *iptr; - opal_pmix_data_range_t range = OPAL_PMIX_RANGE_SESSION; /* the list of info objects are directives for us - they include * things like timeout constraints, so there is no reason to @@ -234,25 +344,18 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys, /* no help for it - need to search for range */ OPAL_LIST_FOREACH(iptr, info, opal_value_t) { if (0 == strcmp(iptr->key, OPAL_PMIX_RANGE)) { - range = (opal_pmix_data_range_t)iptr->data.uint; + req->range = (opal_pmix_data_range_t)iptr->data.uint; break; } } /* pack the range */ - if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &range, 1, OPAL_PMIX_DATA_RANGE))) { + if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &req->range, 1, OPAL_PMIX_DATA_RANGE))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(req); return rc; } - /* if the range is SESSION, then set the target to the global server */ - if (OPAL_PMIX_RANGE_SESSION == range) { - req->target = orte_pmix_server_globals.server; - } else { - req->target = *ORTE_PROC_MY_HNP; - } - /* pack the number of keys */ nkeys = opal_argv_count(keys); if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &nkeys, 1, OPAL_UINT32))) { @@ -309,7 +412,6 @@ int pmix_server_unpublish_fn(opal_process_name_t *proc, char **keys, uint8_t cmd = ORTE_PMIX_UNPUBLISH_CMD; uint32_t nkeys, n; opal_value_t *iptr; - opal_pmix_data_range_t range = OPAL_PMIX_RANGE_SESSION; /* create the caddy */ req = OBJ_NEW(pmix_server_req_t); @@ -334,25 +436,18 @@ int pmix_server_unpublish_fn(opal_process_name_t *proc, char **keys, /* no help for it - need to search for range */ OPAL_LIST_FOREACH(iptr, info, opal_value_t) { if (0 == strcmp(iptr->key, OPAL_PMIX_RANGE)) { - range = (opal_pmix_data_range_t)iptr->data.integer; + req->range = (opal_pmix_data_range_t)iptr->data.integer; break; } } /* pack the range */ - if 
(OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &range, 1, OPAL_INT))) { + if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &req->range, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(req); return rc; } - /* if the range is SESSION, then set the target to the global server */ - if (OPAL_PMIX_RANGE_SESSION == range) { - req->target = orte_pmix_server_globals.server; - } else { - req->target = *ORTE_PROC_MY_HNP; - } - /* pack the number of keys */ nkeys = opal_argv_count(keys); if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &nkeys, 1, OPAL_UINT32))) { From 6a1bc308d2ffb3f4d96221c13b6565bdb07274dd Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sat, 27 May 2017 10:47:08 -0700 Subject: [PATCH 17/29] Update the connect/accept support so we check to see if we have the proper infrastructure and RTE support, including whether we have ompi-server available if the connect/accept spans multiple applications. Print pretty help messages in all cases where we do not have support Signed-off-by: Ralph Castain (cherry picked from commit 9f60cd0fe7e3e089f1da9831deb69650dc7a5790) --- ompi/dpm/dpm.c | 9 +++++- ompi/mca/rte/orte/rte_orte.h | 5 +++- ompi/mca/rte/orte/rte_orte_module.c | 45 ++++++++++++++++++++++++++++ ompi/runtime/help-mpi-runtime.txt | 11 +++++++ opal/mca/pmix/base/base.h | 3 +- opal/mca/pmix/base/pmix_base_fns.c | 7 ++++- opal/mca/pmix/base/pmix_base_frame.c | 6 ++++ orte/orted/help-orted.txt | 4 ++- orte/orted/pmix/pmix_server.c | 8 ----- orte/orted/pmix/pmix_server_pub.c | 11 +++---- orte/runtime/orte_globals.c | 1 + orte/runtime/orte_globals.h | 1 + orte/runtime/orte_mca_params.c | 9 ++++++ orte/tools/orterun/help-orterun.txt | 15 ++++++++++ 14 files changed, 115 insertions(+), 20 deletions(-) diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index d4346e417d7..81c76765e3f 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -15,7 +15,7 @@ * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. 
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -40,6 +40,7 @@ #include "opal/util/argv.h" #include "opal/util/opal_getcwd.h" #include "opal/util/proc.h" +#include "opal/util/show_help.h" #include "opal/dss/dss.h" #include "opal/mca/hwloc/base/base.h" #include "opal/mca/pmix/pmix.h" @@ -112,6 +113,12 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, if (NULL == opal_pmix.publish || NULL == opal_pmix.connect || NULL == opal_pmix.unpublish || (NULL == opal_pmix.lookup && NULL == opal_pmix.lookup_nb)) { + /* print a nice message explaining we don't have support */ + opal_show_help("help-mpi-runtime.txt", "noconxcpt", true); + return OMPI_ERR_NOT_SUPPORTED; + } + if (!ompi_rte_connect_accept_support(port_string)) { + /* they will have printed the help message */ return OMPI_ERR_NOT_SUPPORTED; } diff --git a/ompi/mca/rte/orte/rte_orte.h b/ompi/mca/rte/orte/rte_orte.h index b71a6e8323a..6c2ac8e40ff 100644 --- a/ompi/mca/rte/orte/rte_orte.h +++ b/ompi/mca/rte/orte/rte_orte.h @@ -1,7 +1,7 @@ /* * Copyright (c) 2012-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
@@ -116,6 +116,9 @@ static inline orte_process_name_t * OMPI_CAST_RTE_NAME(opal_process_name_t * nam } #endif +/* check dynamics support */ +OMPI_DECLSPEC bool ompi_rte_connect_accept_support(const char *port); + END_C_DECLS #endif /* MCA_OMPI_RTE_ORTE_H */ diff --git a/ompi/mca/rte/orte/rte_orte_module.c b/ompi/mca/rte/orte/rte_orte_module.c index aa4f5ad5a49..15248e82cb1 100644 --- a/ompi/mca/rte/orte/rte_orte_module.c +++ b/ompi/mca/rte/orte/rte_orte_module.c @@ -39,6 +39,7 @@ #include "orte/mca/routed/routed.h" #include "orte/util/name_fns.h" #include "orte/util/session_dir.h" +#include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_data_server.h" @@ -190,3 +191,47 @@ void ompi_rte_wait_for_debugger(void) opal_pmix.deregister_evhandler(handler, NULL, NULL); } } + +bool ompi_rte_connect_accept_support(const char *port) +{ + char *ptr, *tmp; + orte_process_name_t name; + + /* were we launched by mpirun, or are we calling + * without a defined port? */ + if (NULL == orte_process_info.my_hnp_uri || + 0 == strlen(port)) { + return true; + } + + /* is the job family in the port different than my own? */ + tmp = strdup(port); // protect input + if (NULL == (ptr = strchr(tmp, ':'))) { + /* this port didn't come from us! 
*/ + orte_show_help("help-orterun.txt", "orterun:malformedport", true); + free(tmp); + return false; + } + *ptr = '\0'; + if (ORTE_SUCCESS != orte_util_convert_string_to_process_name(&name, tmp)) { + free(tmp); + orte_show_help("help-orterun.txt", "orterun:malformedport", true); + return false; + } + free(tmp); + if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) == ORTE_JOB_FAMILY(name.jobid)) { + /* same job family, so our infrastructure is adequate */ + return true; + } + + /* if the job family of the port is different than our own + * and we were launched by mpirun, then we require ompi-server + * support */ + if (NULL == orte_data_server_uri) { + /* print a pretty help message */ + orte_show_help("help-orterun.txt", "orterun:server-unavailable", true); + return false; + } + + return true; +} diff --git a/ompi/runtime/help-mpi-runtime.txt b/ompi/runtime/help-mpi-runtime.txt index f2028417b98..ee0e29d6da0 100644 --- a/ompi/runtime/help-mpi-runtime.txt +++ b/ompi/runtime/help-mpi-runtime.txt @@ -12,6 +12,7 @@ # All rights reserved. # Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2013 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -93,3 +94,13 @@ Open MPI with --enable-heterogeneous. [no cuda support] The user requested CUDA support with the --mca mpi_cuda_support 1 flag but the library was not compiled with any support. +# +[noconxcpt] +The user has called an operation involving MPI_Comm_connect and/or MPI_Comm_accept, +but this environment lacks the necessary infrastructure support for +that operation. Open MPI relies on the PMIx_Publish/Lookup (or one of +its predecessors) APIs for this operation. + +This typically happens when launching outside of mpirun where the underlying +resource manager does not provide publish/lookup support. One way of solving +the problem is to simply use mpirun to start the application.
diff --git a/opal/mca/pmix/base/base.h b/opal/mca/pmix/base/base.h index dd64912c4a9..4c499ff5d1d 100644 --- a/opal/mca/pmix/base/base.h +++ b/opal/mca/pmix/base/base.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -57,6 +57,7 @@ OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase); typedef struct { opal_event_base_t *evbase; + int timeout; } opal_pmix_base_t; extern opal_pmix_base_t opal_pmix_base; diff --git a/opal/mca/pmix/base/pmix_base_fns.c b/opal/mca/pmix/base/pmix_base_fns.c index cb9e4ccf43f..d129cf1df0a 100644 --- a/opal/mca/pmix/base/pmix_base_fns.c +++ b/opal/mca/pmix/base/pmix_base_fns.c @@ -193,7 +193,12 @@ int opal_pmix_base_exchange(opal_value_t *indat, info = OBJ_NEW(opal_value_t); info->key = strdup(OPAL_PMIX_TIMEOUT); info->type = OPAL_INT; - info->data.integer = timeout; + if (0 < opal_pmix_base.timeout) { + /* the user has overridden the default */ + info->data.integer = opal_pmix_base.timeout; + } else { + info->data.integer = timeout; + } opal_list_append(&mlist, &info->super); /* if a non-blocking version of lookup isn't diff --git a/opal/mca/pmix/base/pmix_base_frame.c b/opal/mca/pmix/base/pmix_base_frame.c index 99d281fe722..f767391249c 100644 --- a/opal/mca/pmix/base/pmix_base_frame.c +++ b/opal/mca/pmix/base/pmix_base_frame.c @@ -47,6 +47,12 @@ static int opal_pmix_base_frame_register(mca_base_register_flag_t flags) (void) mca_base_var_register("opal", "pmix", "base", "collect_data", "Collect all data during modex", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_collect_all_data); + + opal_pmix_base.timeout = -1; + (void) mca_base_var_register("opal", "pmix", "base", "exchange_timeout", + "Time (in seconds) to wait for a data exchange to complete", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_3, + 
MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_base.timeout); return OPAL_SUCCESS; } diff --git a/orte/orted/help-orted.txt b/orte/orted/help-orted.txt index cec46c2d159..fa7e25b487b 100644 --- a/orte/orted/help-orted.txt +++ b/orte/orted/help-orted.txt @@ -67,7 +67,9 @@ A request has timed out and will therefore fail: Operation: %s Your job may terminate as a result of this problem. You may want to -adjust the MCA parameter pmix_server_max_wait and try again. +adjust the MCA parameter pmix_server_max_wait and try again. If this +occurred during a connect/accept operation, you can adjust that time +using the pmix_base_exchange_timeout parameter. # [noroom] A request for an asynchronous runtime operation cannot be fulfilled diff --git a/orte/orted/pmix/pmix_server.c b/orte/orted/pmix/pmix_server.c index 0ed02ce6b74..d443ee4c688 100644 --- a/orte/orted/pmix/pmix_server.c +++ b/orte/orted/pmix/pmix_server.c @@ -138,14 +138,6 @@ void pmix_server_register_params(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &orte_pmix_server_globals.timeout); - /* register the URI of the UNIVERSAL data server */ - orte_pmix_server_globals.server_uri = NULL; - (void) mca_base_var_register ("orte", "pmix", NULL, "server_uri", - "URI of a session-level keyval server for publish/lookup operations", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, - &orte_pmix_server_globals.server_uri); - /* whether or not to wait for the universal server */ orte_pmix_server_globals.wait_for_server = false; (void) mca_base_var_register ("orte", "pmix", NULL, "wait_for_server", diff --git a/orte/orted/pmix/pmix_server_pub.c b/orte/orted/pmix/pmix_server_pub.c index 0f009d1a9f1..f970b3b5909 100644 --- a/orte/orted/pmix/pmix_server_pub.c +++ b/orte/orted/pmix/pmix_server_pub.c @@ -59,13 +59,13 @@ static int init_server(void) /* if the universal server wasn't specified, then we use * our own HNP for that purpose */ - if (NULL == orte_pmix_server_globals.server_uri) { + if (NULL == 
orte_data_server_uri) { orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP; } else { - if (0 == strncmp(orte_pmix_server_globals.server_uri, "file", strlen("file")) || - 0 == strncmp(orte_pmix_server_globals.server_uri, "FILE", strlen("FILE"))) { + if (0 == strncmp(orte_data_server_uri, "file", strlen("file")) || + 0 == strncmp(orte_data_server_uri, "FILE", strlen("FILE"))) { /* it is a file - get the filename */ - filename = strchr(orte_pmix_server_globals.server_uri, ':'); + filename = strchr(orte_data_server_uri, ':'); if (NULL == filename) { /* filename is not correctly formatted */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, @@ -121,7 +121,6 @@ static int init_server(void) * as a background job - e.g., in scripts */ if (orte_pmix_server_globals.wait_for_server) { - opal_output(0, "WAIT"); /* ping the server */ struct timeval timeout; timeout.tv_sec = orte_pmix_server_globals.timeout; @@ -141,8 +140,6 @@ static int init_server(void) } } - opal_output(0, "SERVER READY"); - return ORTE_SUCCESS; } diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 68826c4abf0..678d1f66a2d 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -77,6 +77,7 @@ char *orte_coll_transport = NULL; int orte_mgmt_conduit = -1; int orte_coll_conduit = -1; bool orte_no_vm = false; +char *orte_data_server_uri = NULL; /* ORTE OOB port flags */ bool orte_static_ports = false; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 0b46dfc73db..eb1039edaa3 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -457,6 +457,7 @@ ORTE_DECLSPEC extern bool orte_coprocessors_detected; ORTE_DECLSPEC extern opal_hash_table_t *orte_coprocessors; ORTE_DECLSPEC extern char *orte_topo_signature; ORTE_DECLSPEC extern bool orte_no_vm; +ORTE_DECLSPEC extern char *orte_data_server_uri; /* ORTE OOB port flags */ ORTE_DECLSPEC extern bool orte_static_ports; diff --git 
a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 97735961383..3e642ac5bb6 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -788,5 +788,14 @@ int orte_register_params(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &orte_fwd_mpirun_port); + /* register the URI of the UNIVERSAL data server */ + orte_data_server_uri = NULL; + (void) mca_base_var_register ("orte", "pmix", NULL, "server_uri", + "URI of a session-level keyval server for publish/lookup operations", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_ALL, + &orte_data_server_uri); + + return ORTE_SUCCESS; } diff --git a/orte/tools/orterun/help-orterun.txt b/orte/tools/orterun/help-orterun.txt index ff49f2e786b..2b006f005af 100644 --- a/orte/tools/orterun/help-orterun.txt +++ b/orte/tools/orterun/help-orterun.txt @@ -458,6 +458,21 @@ Error received: %s Please check to ensure that the requested server matches the actual server information, and that the server is in operation. # +[orterun:server-unavailable] +The user has called an operation involving MPI_Comm_connect and/or MPI_Comm_accept +that spans multiple invocations of mpirun. This requires the support of +the ompi-server tool, which must be executing somewhere that can be +accessed by all participants. + +Please ensure the tool is running, and provide each mpirun with the MCA +parameter "pmix_server_uri" pointing to it. +# +[orterun:malformedport] +An operation involving MPI_Comm_connect and/or MPI_Comm_accept was called with +an unrecognized port string. This typically happens when passing the +string on a command line and failing to properly quote it to protect +against the special characters it includes. +# [orterun:ompi-server-pid-bad] %s was unable to parse the PID of the %s to be used as the ompi-server.
The option we were given was: From 27e8e3bbda22f608429f8ddd2820f0f02226ceac Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sat, 27 May 2017 11:45:53 -0700 Subject: [PATCH 18/29] Silence coverity warnings Signed-off-by: Ralph Castain (cherry picked from commit 87201a80ff9adc010bf2193641c2ff2920e2b22c) --- orte/mca/rmaps/base/rmaps_base_ranking.c | 3 ++- orte/orted/orted_comm.c | 10 ++++++++++ orte/util/nidmap.c | 8 +++++--- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/orte/mca/rmaps/base/rmaps_base_ranking.c b/orte/mca/rmaps/base/rmaps_base_ranking.c index cb5d6a09a0c..8be87fa50e1 100644 --- a/orte/mca/rmaps/base/rmaps_base_ranking.c +++ b/orte/mca/rmaps/base/rmaps_base_ranking.c @@ -413,7 +413,8 @@ static int rank_by(orte_job_t *jdata, return ORTE_ERROR; } /* ignore procs not on this object */ - if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { + if (NULL == locale || + !hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:rank_by: proc at position %d is not on object %d", j, i); diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index 4b5b7932c0e..880615c0a36 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -663,24 +663,32 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, ORTE_ERROR_LOG(ret); free(cmpdata); OBJ_DESTRUCT(&data); + OBJ_RELEASE(answer); + goto CLEANUP; } /* pack the compressed length */ if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &cmplen, 1, OPAL_SIZE))) { ORTE_ERROR_LOG(ret); free(cmpdata); OBJ_DESTRUCT(&data); + OBJ_RELEASE(answer); + goto CLEANUP; } /* pack the uncompressed length */ if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &data.bytes_used, 1, OPAL_SIZE))) { ORTE_ERROR_LOG(ret); free(cmpdata); OBJ_DESTRUCT(&data); + OBJ_RELEASE(answer); + goto CLEANUP; } /* pack the compressed info */ if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, cmpdata, cmplen, OPAL_UINT8))) { 
ORTE_ERROR_LOG(ret); free(cmpdata); OBJ_DESTRUCT(&data); + OBJ_RELEASE(answer); + goto CLEANUP; } OBJ_DESTRUCT(&data); free(cmpdata); @@ -691,6 +699,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&data); free(cmpdata); + OBJ_RELEASE(answer); + goto CLEANUP; } /* transfer the payload across */ opal_dss.copy_payload(answer, &data); diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index ca4948fcbca..cba8139224d 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -1285,9 +1285,11 @@ int orte_util_nidmap_generate_ppn(orte_job_t *jdata, char **ppn) } } OPAL_LIST_DESTRUCT(&prk[n]); // releases all the actives objects - opal_argv_append_nosize(&cache, ptmp); - free(ptmp); - ptmp = NULL; + if (NULL != ptmp) { + opal_argv_append_nosize(&cache, ptmp); + free(ptmp); + ptmp = NULL; + } } free(prk); free(cnt); From 46a9f7c93bb8759f08ac4d36dde9b9f8a1d166b3 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 28 May 2017 10:30:58 -0700 Subject: [PATCH 19/29] Update to PMIx v2.0.0rc1 Signed-off-by: Ralph Castain (cherry picked from commit 9f1f9d66069c500723205d3ab0401a1b1ebbef54) --- opal/mca/pmix/pmix2x/pmix/VERSION | 6 +- .../pmix/pmix2x/pmix/include/pmix_common.h | 249 +++++++++++++++++- .../pmix2x/pmix/src/buffer_ops/open_close.c | 8 +- .../pmix2x/pmix/src/common/Makefile.include | 3 +- .../pmix/pmix2x/pmix/src/common/pmix_data.c | 159 +++++++++++ .../pmix2x/pmix/src/common/pmix_jobdata.c | 22 +- .../pmix/pmix2x/pmix/src/event/pmix_event.h | 80 +++++- .../pmix/src/event/pmix_event_notification.c | 19 ++ .../pmix/src/runtime/help-pmix-runtime.txt | 42 +-- .../pmix2x/pmix/src/runtime/pmix_params.c | 16 ++ .../pmix/pmix2x/pmix/src/runtime/pmix_rte.h | 4 +- .../pmix/pmix2x/pmix/test/simple/simpdie.c | 159 +++++++++++ 12 files changed, 703 insertions(+), 64 deletions(-) create mode 100644 opal/mca/pmix/pmix2x/pmix/src/common/pmix_data.c create mode 100644 opal/mca/pmix/pmix2x/pmix/test/simple/simpdie.c diff --git 
a/opal/mca/pmix/pmix2x/pmix/VERSION b/opal/mca/pmix/pmix2x/pmix/VERSION index b7a91495220..4de2c58255e 100644 --- a/opal/mca/pmix/pmix2x/pmix/VERSION +++ b/opal/mca/pmix/pmix2x/pmix/VERSION @@ -23,14 +23,14 @@ release=0 # The only requirement is that it must be entirely printable ASCII # characters and have no white space. -greek= +greek=a1 # If repo_rev is empty, then the repository version number will be # obtained during "make dist" via the "git describe --tags --always" # command, or with the date (if "git describe" fails) in the form of # "date". -repo_rev=git198a2b0 +repo_rev=git223d70e # If tarball_version is not empty, it is used as the version string in # the tarball filename, regardless of all other versions listed in @@ -44,7 +44,7 @@ tarball_version= # The date when this release was created -date="Apr 12, 2017" +date="Nov 09, 2016" # The shared library version of each of PMIx's public libraries. # These versions are maintained in accordance with the "Library diff --git a/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h b/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h index 7bc9a8ce89a..74e4d7b6169 100644 --- a/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h +++ b/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h @@ -685,6 +685,44 @@ typedef struct pmix_byte_object { } pmix_byte_object_t; +/**** PMIX DATA BUFFER ****/ +typedef struct pmix_data_buffer { + /** Start of my memory */ + char *base_ptr; + /** Where the next data will be packed to (within the allocated + memory starting at base_ptr) */ + char *pack_ptr; + /** Where the next data will be unpacked from (within the + allocated memory starting at base_ptr) */ + char *unpack_ptr; + /** Number of bytes allocated (starting at base_ptr) */ + size_t bytes_allocated; + /** Number of bytes used by the buffer (i.e., amount of data -- + including overhead -- packed in the buffer) */ + size_t bytes_used; +} pmix_data_buffer_t; +#define PMIX_DATA_BUFFER_CREATE(m) \ + do { \ + (m) =
(pmix_data_buffer_t*)calloc(1, sizeof(pmix_data_buffer_t)); \ + } while (0) +#define PMIX_DATA_BUFFER_RELEASE(m) \ + do { \ + if (NULL != (m)->base_ptr) { \ + free((m)->base_ptr); \ + } \ + free((m)); \ + (m) = NULL; \ + } while (0) +#define PMIX_DATA_BUFFER_CONSTRUCT(m) \ + memset((m), 0, sizeof(pmix_data_buffer_t)) +#define PMIX_DATA_BUFFER_DESTRUCT(m) \ + do { \ + if (NULL != (m)->base_ptr) { \ + free((m)->base_ptr); \ + } \ + } while (0) + + /**** PMIX PROC OBJECT ****/ typedef struct pmix_proc { char nspace[PMIX_MAX_NSLEN+1]; @@ -695,9 +733,10 @@ typedef struct pmix_proc { (m) = (pmix_proc_t*)calloc((n) , sizeof(pmix_proc_t)); \ } while (0) -#define PMIX_PROC_RELEASE(m) \ - do { \ - PMIX_PROC_FREE((m)); \ +#define PMIX_PROC_RELEASE(m) \ + do { \ + free((m)); \ + (m) = NULL; \ } while (0) #define PMIX_PROC_CONSTRUCT(m) \ @@ -952,7 +991,6 @@ pmix_status_t pmix_setenv(const char *name, const char *value, #define PMIX_SETENV(a, b, c) \ pmix_setenv((a), (b), true, (c)) - /**** PMIX INFO STRUCT ****/ struct pmix_info_t { char key[PMIX_MAX_KEYLEN+1]; // ensure room for the NULL terminator @@ -1487,6 +1525,209 @@ pmix_status_t PMIx_Store_internal(const pmix_proc_t *proc, const char *key, pmix_value_t *val); +/** + * Top-level interface function to pack one or more values into a + * buffer. + * + * The pack function packs one or more values of a specified type into + * the specified buffer. The buffer must have already been + * initialized via the PMIX_DATA_BUFFER_CREATE or PMIX_DATA_BUFFER_CONSTRUCT + * call - otherwise, the pack_value function will return an error. + * Providing an unsupported type flag will likewise be reported as an error. + * + * Note that any data to be packed that is not hard type cast (i.e., + * not type cast to a specific size) may lose precision when unpacked + * by a non-homogeneous recipient. The PACK function will do its best to deal + * with heterogeneity issues between the packer and unpacker in such + * cases. 
Sending a number larger than can be handled by the recipient + * will return an error code (generated upon unpacking) - + * the error cannot be detected during packing. + * + * @param *buffer A pointer to the buffer into which the value is to + * be packed. + * + * @param *src A void* pointer to the data that is to be packed. Note + * that strings are to be passed as (char **) - i.e., the caller must + * pass the address of the pointer to the string as the void*. This + * allows PMIx to use a single pack function, but still allow + * the caller to pass multiple strings in a single call. + * + * @param num_values An int32_t indicating the number of values that are + * to be packed, beginning at the location pointed to by src. A string + * value is counted as a single value regardless of length. The values + * must be contiguous in memory. Arrays of pointers (e.g., string + * arrays) should be contiguous, although (obviously) the data pointed + * to need not be contiguous across array entries. + * + * @param type The type of the data to be packed - must be one of the + * PMIX defined data types. + * + * @retval PMIX_SUCCESS The data was packed as requested. + * + * @retval PMIX_ERROR(s) An appropriate PMIX error code indicating the + * problem encountered. This error code should be handled + * appropriately. + * + * @code + * pmix_data_buffer_t *buffer; + * int32_t src; + * + * PMIX_DATA_BUFFER_CREATE(buffer); + * status_code = PMIx_Data_pack(buffer, &src, 1, PMIX_INT32); + * @endcode + */ +pmix_status_t PMIx_Data_pack(pmix_data_buffer_t *buffer, + void *src, int32_t num_vals, + pmix_data_type_t type); + +/** + * Unpack values from a buffer. + * + * The unpack function unpacks the next value (or values) of a + * specified type from the specified buffer. 
+ * + * The buffer must have already been initialized via a PMIX_DATA_BUFFER_CREATE or + * PMIX_DATA_BUFFER_CONSTRUCT call (and presumably filled with some data) - + * otherwise, the unpack function will return an + * error. Providing an unsupported type flag will likewise be reported + * as an error, as will specifying a data type that DOES NOT match the + * type of the next item in the buffer. An attempt to read beyond the + * end of the stored data held in the buffer will also return an + * error. + * + * NOTE: it is possible for the buffer to be corrupted and that + * PMIx will *think* there is a proper variable type at the + * beginning of an unpack region - but that the value is bogus (e.g., just + * a byte field in a string array that so happens to have a value that + * matches the specified data type flag). Therefore, the data type error check + * is NOT completely safe. This is true for ALL unpack functions. + * + * + * Unpacking values is a "nondestructive" process - i.e., the values are + * not removed from the buffer. It is therefore possible for the caller + * to re-unpack a value from the same buffer by resetting the unpack_ptr. + * + * Warning: The caller is responsible for providing adequate memory + * storage for the requested data. As noted below, the user + * must provide a parameter indicating the maximum number of values that + * can be unpacked into the allocated memory. If more values exist in the + * buffer than can fit into the memory storage, then the function will unpack + * what it can fit into that location and return an error code indicating + * that the buffer was only partially unpacked. + * + * Note that any data that was not hard type cast (i.e., not type cast + * to a specific size) when packed may lose precision when unpacked by + * a non-homogeneous recipient. PMIx will do its best to deal with + * heterogeneity issues between the packer and unpacker in such + * cases.
Sending a number larger than can be handled by the recipient + * will return an error code generated upon unpacking - these errors + * cannot be detected during packing. + * + * @param *buffer A pointer to the buffer from which the value will be + * extracted. + * + * @param *dest A void* pointer to the memory location into which the + * data is to be stored. Note that these values will be stored + * contiguously in memory. For strings, this pointer must be to (char + * **) to provide a means of supporting multiple string + * operations. The unpack function will allocate memory for each + * string in the array - the caller must only provide adequate memory + * for the array of pointers. + * + * @param type The type of the data to be unpacked - must be one of + * the PMIX defined data types. + * + * @param *max_num_values The number of values actually unpacked. In + * most cases, this should match the maximum number provided in the + * parameters - but in no case will it exceed the value of this + * parameter. Note that if you unpack fewer values than are actually + * available, the buffer will be in an unpackable state - the function will + * return an error code to warn of this condition. + * + * @note The unpack function will return the actual number of values + * unpacked in this location. + * + * @retval PMIX_SUCCESS The next item in the buffer was successfully + * unpacked. + * + * @retval PMIX_ERROR(s) The unpack function returns an error code + * under one of several conditions: (a) the number of values in the + * item exceeds the max num provided by the caller; (b) the type of + * the next item in the buffer does not match the type specified by + * the caller; or (c) the unpack failed due to either an error in the + * buffer or an attempt to read past the end of the buffer. 
+ * + * @code + * pmix_data_buffer_t *buffer; + * int32_t dest; + * char **string_array; + * int32_t num_values; + * + * num_values = 1; + * status_code = PMIx_Data_unpack(buffer, (void*)&dest, &num_values, PMIX_INT32); + * + * num_values = 5; + * string_array = malloc(num_values*sizeof(char *)); + * status_code = PMIx_Data_unpack(buffer, (void*)(string_array), &num_values, PMIX_STRING); + * + * @endcode + */ +pmix_status_t PMIx_Data_unpack(pmix_data_buffer_t *buffer, void *dest, + int32_t *max_num_values, + pmix_data_type_t type); + +/** + * Copy a data value from one location to another. + * + * Since registered data types can be complex structures, the system + * needs some way to know how to copy the data from one location to + * another (e.g., for storage in the registry). This function, which + * can call other copy functions to build up complex data types, defines + * the method for making a copy of the specified data type. + * + * @param **dest The address of a pointer into which the + * address of the resulting data is to be stored. + * + * @param *src A pointer to the memory location from which the + * data is to be copied. + * + * @param type The type of the data to be copied - must be one of + * the PMIx defined data types. + * + * @retval PMIX_SUCCESS The value was successfully copied. + * + * @retval PMIX_ERROR(s) An appropriate error code. + * + */ +pmix_status_t PMIx_Data_copy(void **dest, void *src, pmix_data_type_t type); + +/** + * Print a data value. + * + * Since registered data types can be complex structures, the system + * needs some way to know how to print them (i.e., convert them to a string + * representation). Provided for debug purposes. + * + * @retval PMIX_SUCCESS The value was successfully printed. + * + * @retval PMIX_ERROR(s) An appropriate error code. 
+ */ +pmix_status_t PMIx_Data_print(char **output, char *prefix, + void *src, pmix_data_type_t type); + +/** + * Copy a payload from one buffer to another + * + * This function will append a copy of the payload in one buffer into + * another buffer. + * NOTE: This is NOT a destructive procedure - the + * source buffer's payload will remain intact, as will any pre-existing + * payload in the destination's buffer. + */ +pmix_status_t PMIx_Data_copy_payload(pmix_data_buffer_t *dest, + pmix_data_buffer_t *src); + + /* Key-Value pair management macros */ // TODO: add all possible types/fields here. diff --git a/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/open_close.c b/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/open_close.c index 486d6f25549..ddb48071db4 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/open_close.c +++ b/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/open_close.c @@ -14,7 +14,7 @@ * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2016 IBM Corporation. All rights reserved. + * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -46,7 +46,7 @@ pmix_pointer_array_t pmix_bfrop_types = {{0}}; pmix_data_type_t pmix_bfrop_num_reg_types = PMIX_UNDEF; static pmix_bfrop_buffer_type_t pmix_default_buf_type = PMIX_BFROP_BUFFER_NON_DESC; -pmix_bfrop_t pmix_bfrop = { +PMIX_EXPORT pmix_bfrop_t pmix_bfrop = { pmix_bfrop_pack, pmix_bfrop_unpack, pmix_bfrop_copy, @@ -149,7 +149,7 @@ PMIX_CLASS_INSTANCE(pmix_regex_value_t, pmix_list_item_t, rvcon, rvdes); -pmix_status_t pmix_bfrop_open(void) +PMIX_EXPORT pmix_status_t pmix_bfrop_open(void) { pmix_status_t rc; @@ -445,7 +445,7 @@ pmix_status_t pmix_bfrop_open(void) } -pmix_status_t pmix_bfrop_close(void) +PMIX_EXPORT pmix_status_t pmix_bfrop_close(void) { int32_t i; diff --git a/opal/mca/pmix/pmix2x/pmix/src/common/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/common/Makefile.include index 6a566f58a4b..e8b9a46a62d 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/common/Makefile.include +++ b/opal/mca/pmix/pmix2x/pmix/src/common/Makefile.include @@ -14,4 +14,5 @@ sources += \ common/pmix_strings.c \ common/pmix_log.c \ common/pmix_jobdata.c \ - common/pmix_control.c + common/pmix_control.c \ + common/pmix_data.c diff --git a/opal/mca/pmix/pmix2x/pmix/src/common/pmix_data.c b/opal/mca/pmix/pmix2x/pmix/src/common/pmix_data.c new file mode 100644 index 00000000000..a10f4057cc2 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/common/pmix_data.c @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * Copyright (c) 2007-2012 Los Alamos National Security, LLC. + * All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include + + +#ifdef HAVE_STRING_H +#include +#endif +#include +#include +#ifdef HAVE_STDLIB_H +#include +#endif + +#include +#include + +#include "src/buffer_ops/buffer_ops.h" + +#define PMIX_EMBED_DATA_BUFFER(b, db) \ + do { \ + (b)->base_ptr = (db)->base_ptr; \ + (b)->pack_ptr = (db)->pack_ptr; \ + (b)->unpack_ptr = (db)->unpack_ptr; \ + (b)->bytes_allocated = (db)->bytes_allocated; \ + (b)->bytes_used = (db)->bytes_used; \ + (db)->base_ptr = NULL; \ + (db)->pack_ptr = NULL; \ + (db)->unpack_ptr = NULL; \ + (db)->bytes_allocated = 0; \ + (db)->bytes_used = 0; \ + } while (0) + +#define PMIX_EXTRACT_DATA_BUFFER(b, db) \ + do { \ + (db)->base_ptr = (b)->base_ptr; \ + (db)->pack_ptr = (b)->pack_ptr; \ + (db)->unpack_ptr = (b)->unpack_ptr; \ + (db)->bytes_allocated = (b)->bytes_allocated; \ + (db)->bytes_used = (b)->bytes_used; \ + (b)->base_ptr = NULL; \ + (b)->pack_ptr = NULL; \ + (b)->unpack_ptr = NULL; \ + (b)->bytes_allocated = 0; \ + (b)->bytes_used = 0; \ + } while (0) + +PMIX_EXPORT pmix_status_t PMIx_Data_pack(pmix_data_buffer_t *buffer, + void *src, int32_t num_vals, + pmix_data_type_t type) +{ + pmix_status_t rc; + pmix_buffer_t buf; + + /* setup the host */ + PMIX_CONSTRUCT(&buf, pmix_buffer_t); + + /* embed the data buffer into a buffer */ + PMIX_EMBED_DATA_BUFFER(&buf, buffer); + + /* pack the value */ + rc = pmix_bfrop.pack(&buf, src, num_vals, type); + + /* extract the data buffer - the pointers may have changed */ + PMIX_EXTRACT_DATA_BUFFER(&buf, buffer); + + /* no need to cleanup as all storage was xfered */ + return rc; +} + + +PMIX_EXPORT pmix_status_t PMIx_Data_unpack(pmix_data_buffer_t *buffer, void *dest, + int32_t *max_num_values, + pmix_data_type_t type) +{ + pmix_status_t rc; + pmix_buffer_t buf; + + /* setup the 
host */ + PMIX_CONSTRUCT(&buf, pmix_buffer_t); + + /* embed the data buffer into a buffer */ + PMIX_EMBED_DATA_BUFFER(&buf, buffer); + + /* unpack the value */ + rc = pmix_bfrop.unpack(&buf, dest, max_num_values, type); + + /* extract the data buffer - the pointers may have changed */ + PMIX_EXTRACT_DATA_BUFFER(&buf, buffer); + + /* no need to cleanup as all storage was xfered */ + return rc; +} + +PMIX_EXPORT pmix_status_t PMIx_Data_copy(void **dest, void *src, + pmix_data_type_t type) +{ + pmix_status_t rc; + + /* copy the value */ + rc = pmix_bfrop.copy(dest, src, type); + + return rc; +} + +PMIX_EXPORT pmix_status_t PMIx_Data_print(char **output, char *prefix, + void *src, pmix_data_type_t type) +{ + pmix_status_t rc; + + /* print the value */ + rc = pmix_bfrop.print(output, prefix, src, type); + + return rc; +} + +PMIX_EXPORT pmix_status_t PMIx_Data_copy_payload(pmix_data_buffer_t *dest, + pmix_data_buffer_t *src) +{ + pmix_status_t rc; + pmix_buffer_t buf1, buf2; + + /* setup the hosts */ + PMIX_CONSTRUCT(&buf1, pmix_buffer_t); + PMIX_CONSTRUCT(&buf2, pmix_buffer_t); + + /* embed the data buffer into a buffer */ + PMIX_EMBED_DATA_BUFFER(&buf1, dest); + PMIX_EMBED_DATA_BUFFER(&buf2, src); + + /* copy payload */ + rc = pmix_bfrop.copy_payload(&buf1, &buf2); + + /* extract the dest data buffer - the pointers may have changed */ + PMIX_EXTRACT_DATA_BUFFER(&buf1, dest); + PMIX_EXTRACT_DATA_BUFFER(&buf2, src); + + /* no need to cleanup as all storage was xfered */ + return rc; +} diff --git a/opal/mca/pmix/pmix2x/pmix/src/common/pmix_jobdata.c b/opal/mca/pmix/pmix2x/pmix/src/common/pmix_jobdata.c index a1c2fd57119..4ca58d6acf7 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/common/pmix_jobdata.c +++ b/opal/mca/pmix/pmix2x/pmix/src/common/pmix_jobdata.c @@ -20,6 +20,8 @@ #include "src/util/argv.h" #include "src/util/compress.h" #include "src/util/hash.h" +#include "src/util/show_help.h" +#include "src/runtime/pmix_rte.h" #include "src/include/pmix_jobdata.h" #if 
defined(PMIX_ENABLE_DSTORE) && (PMIX_ENABLE_DSTORE == 1) @@ -77,6 +79,7 @@ static inline int _rank_key_dstore_store(void *cbdata) pmix_job_data_caddy_t *cb = (pmix_job_data_caddy_t*)cbdata; pmix_rank_t rank; pmix_kval_t *kv = NULL; + bool flag = true; if (NULL == cb->bufs) { rc = PMIX_ERR_BAD_PARAM; @@ -93,9 +96,22 @@ static inline int _rank_key_dstore_store(void *cbdata) tmp = &(PMIX_VALUE_ARRAY_GET_ITEM(cb->bufs, pmix_buffer_t, i)); rank = 0 == i ? PMIX_RANK_WILDCARD : i - 1; PMIX_UNLOAD_BUFFER(tmp, kv->value->data.bo.bytes, kv->value->data.bo.size); - if (PMIX_SUCCESS != (rc = cb->dstore_fn(cb->nsptr->nspace, rank, kv))) { - PMIX_ERROR_LOG(rc); - goto exit; + if (NULL == kv->value->data.bo.bytes) { + if (flag && !pmix_suppress_missing_data_warning) { + /* this occurs if the host RM did _not_ provide us with + * data for every process in the job, in non-compliance + * with the PMIx standard. Warn the user that their job + * may not scale as desired, and give them a way to turn + * that warning off in case the RM just can't do it */ + pmix_show_help("help-pmix-runtime.txt", "missingdata", true); + /* only show this once */ + flag = false; + } + } else { + if (PMIX_SUCCESS != (rc = cb->dstore_fn(cb->nsptr->nspace, rank, kv))) { + PMIX_ERROR_LOG(rc); + goto exit; + } } } diff --git a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event.h b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event.h index e9ebd333181..55f3fac311f 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event.h +++ b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event.h @@ -22,6 +22,8 @@ #define PMIX_EVENT_H #include +#include "src/include/types.h" +#include PMIX_EVENT_HEADER #include #include "src/class/pmix_list.h" @@ -92,8 +94,10 @@ PMIX_CLASS_DECLARATION(pmix_events_t); * means for us to relay the event across that chain */ typedef struct pmix_event_chain_t { - pmix_object_t super; + pmix_list_item_t super; pmix_status_t status; + pmix_event_t ev; + bool timer_active; bool nondefault; bool endchain; 
pmix_proc_t source; @@ -120,19 +124,67 @@ pmix_status_t pmix_server_notify_client_of_event(pmix_status_t status, pmix_info_t info[], size_t ninfo, pmix_op_cbfunc_t cbfunc, void *cbdata); -#define PMIX_REPORT_EVENT(e, f) \ - do { \ - pmix_event_chain_t *_ch; \ - _ch = PMIX_NEW(pmix_event_chain_t); \ - _ch->status = (e); \ - _ch->ninfo = 1; \ - _ch->final_cbfunc = (f); \ - _ch->final_cbdata = _ch; \ - PMIX_INFO_CREATE(_ch->info, _ch->ninfo); \ - PMIX_INFO_LOAD(&_ch->info[0], \ - PMIX_EVENT_RETURN_OBJECT, \ - NULL, PMIX_POINTER); \ - pmix_invoke_local_event_hdlr(_ch); \ +void pmix_event_timeout_cb(int fd, short flags, void *arg); + +#define PMIX_REPORT_EVENT(e, p, r, f) \ + do { \ + pmix_event_chain_t *ch, *cp; \ + size_t n, ninfo; \ + pmix_info_t *info; \ + pmix_proc_t proc; \ + \ + ch = NULL; \ + /* see if we already have this event cached */ \ + PMIX_LIST_FOREACH(cp, &pmix_globals.cached_events, pmix_event_chain_t) { \ + if (cp->status == (e)) { \ + ch = cp; \ + break; \ + } \ + } \ + if (NULL == ch) { \ + /* nope - need to add it */ \ + ch = PMIX_NEW(pmix_event_chain_t); \ + ch->status = (e); \ + ch->range = (r); \ + (void)strncpy(ch->source.nspace, \ + (p)->info->nptr->nspace, \ + PMIX_MAX_NSLEN); \ + ch->source.rank = (p)->info->rank; \ + ch->ninfo = 2; \ + ch->final_cbfunc = (f); \ + ch->final_cbdata = ch; \ + PMIX_INFO_CREATE(ch->info, ch->ninfo); \ + PMIX_INFO_LOAD(&ch->info[0], \ + PMIX_EVENT_HDLR_NAME, \ + NULL, PMIX_STRING); \ + PMIX_INFO_LOAD(&ch->info[1], \ + PMIX_EVENT_RETURN_OBJECT, \ + NULL, PMIX_POINTER); \ + /* cache it */ \ + pmix_list_append(&pmix_globals.cached_events, &ch->super); \ + ch->timer_active = true; \ + pmix_event_assign(&ch->ev, pmix_globals.evbase, -1, 0, \ + pmix_event_timeout_cb, ch); \ + pmix_event_add(&ch->ev, &pmix_globals.event_window); \ + } else { \ + /* add this peer to the array of sources */ \ + (void)strncpy(proc.nspace, (p)->info->nptr->nspace, PMIX_MAX_NSLEN); \ + proc.rank = (p)->info->rank; \ + ninfo = ch->ninfo + 1; 
\ + PMIX_INFO_CREATE(info, ninfo); \ + /* must keep the hdlr name and return object at the end, so prepend */ \ + PMIX_INFO_LOAD(&info[0], PMIX_PROCID, \ + &proc, PMIX_PROC); \ + for (n=0; n < ch->ninfo; n++) { \ + PMIX_INFO_XFER(&info[n+1], &ch->info[n]); \ + } \ + PMIX_INFO_FREE(ch->info, ch->ninfo); \ + ch->info = info; \ + ch->ninfo = ninfo; \ + /* reset the timer */ \ + pmix_event_del(&ch->ev); \ + pmix_event_add(&ch->ev, &pmix_globals.event_window); \ + } \ } while(0) diff --git a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c index 83474169fd0..f0ebe09269b 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c +++ b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c @@ -3,6 +3,8 @@ * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. 
+ * * $COPYRIGHT$ * * Additional copyrights may follow @@ -657,6 +659,23 @@ pmix_status_t pmix_server_notify_client_of_event(pmix_status_t status, } } } + /* + * If the range is PMIX_RANGE_NAMESPACE, then they should not have set a + * PMIX_EVENT_CUSTOM_RANGE info object or at least we should ignore it + */ + if (PMIX_RANGE_NAMESPACE == cd->range) { + if (cd->targets) { + PMIX_PROC_FREE(cd->targets, cd->ntargets); + } + PMIX_PROC_CREATE(cd->targets, 1); + cd->ntargets = 1; + cd->targets[0].rank = PMIX_RANK_WILDCARD; + if (NULL == source) { + strncpy(cd->targets[0].nspace, "UNDEF", PMIX_MAX_NSLEN); + } else { + strncpy(cd->targets[0].nspace, source->nspace, PMIX_MAX_NSLEN); + } + } /* pack the command */ if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(cd->buf, &cmd, 1, PMIX_CMD))) { diff --git a/opal/mca/pmix/pmix2x/pmix/src/runtime/help-pmix-runtime.txt b/opal/mca/pmix/pmix2x/pmix/src/runtime/help-pmix-runtime.txt index 01b5a842273..3f78275d446 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/runtime/help-pmix-runtime.txt +++ b/opal/mca/pmix/pmix2x/pmix/src/runtime/help-pmix-runtime.txt @@ -12,6 +12,7 @@ # All rights reserved. # Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. # Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -31,38 +32,11 @@ PMIX developer): %s failed --> Returned value %d instead of PMIX_SUCCESS # -[pmix_cr_init:no-crs] -It looks like pmix_cr_init failed. This usually means that the CRS component -could not be activated on this machine. Check the installation of your -checkpointer, MCA parameters, and configuration. If all of that seems -correct, then copy this error message with the additional information below -to the PMIX users list. 
- Function: %s - Return value: %d -# -# Just want a clean printout for sys limit as the -# message was already generated by show-help -[pmix_init:syslimit] -%s -# -[pmix_init:warn-fork] -A process has executed an operation involving a call to the -"fork()" system call to create a child process. PMIX is currently -operating in a condition that could result in memory corruption or -other system errors; your job may hang, crash, or produce silent -data corruption. The use of fork() (or system() or other calls that -create child processes) is strongly discouraged. - -The process that invoked fork was: +[missingdata] +PMIx has detected that the host RM failed to provide all the job-level +information specified by the PMIx standard. This is not necessarily +a fatal situation, but may negatively impact your launch performance. - Local host: %s (PID %d) - -If you are *absolutely sure* that your application will successfully -and correctly survive a call to fork(), you may disable this warning -by setting the mpi_warn_on_fork MCA parameter to 0. -# -[mpi-params:leave-pinned-and-pipeline-selected] -WARNING: Cannot set both the MCA parameters pmix_leave_pinned (a.k.a., -mpi_leave_pinned) and pmix_leave_pinned_pipeline (a.k.a., -mpi_leave_pinned_pipeline) to "true". Defaulting to mpi_leave_pinned -ONLY. 
+If you feel you have received this warning in error, or wish to ignore +it in the future, you can disable it by setting the PMIx MCA parameter +"pmix_suppress_missing_data_warning=1" diff --git a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_params.c b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_params.c index e2c60025bb8..c0a40f98d7f 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_params.c +++ b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_params.c @@ -43,6 +43,8 @@ bool pmix_timing_overhead = true; static bool pmix_register_done = false; char *pmix_net_private_ipv4 = NULL; +int pmix_event_caching_window = 1; +bool pmix_suppress_missing_data_warning = false; pmix_status_t pmix_register_params(void) { @@ -90,6 +92,20 @@ pmix_status_t pmix_register_params(void) return ret; } + (void) pmix_mca_base_var_register ("pmix", "pmix", NULL, "event_caching_window", + "Time (in seconds) to aggregate events before reporting them - this " + "suppresses event cascades when processes abnormally terminate", + PMIX_MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + PMIX_INFO_LVL_1, PMIX_MCA_BASE_VAR_SCOPE_ALL, + &pmix_event_caching_window); + + (void) pmix_mca_base_var_register ("pmix", "pmix", NULL, "suppress_missing_data_warning", + "Suppress warning that PMIx is missing job-level data that " + "is supposed to be provided by the host RM.", + PMIX_MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + PMIX_INFO_LVL_1, PMIX_MCA_BASE_VAR_SCOPE_ALL, + &pmix_suppress_missing_data_warning); + return PMIX_SUCCESS; } diff --git a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_rte.h b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_rte.h index 0ef36e271e4..74f590c53de 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_rte.h +++ b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_rte.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. 
+ * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -46,6 +46,8 @@ extern bool pmix_timing_overhead; extern int pmix_initialized; extern char *pmix_net_private_ipv4; +extern int pmix_event_caching_window; +extern bool pmix_suppress_missing_data_warning; /** version string of pmix */ extern const char pmix_version_string[]; diff --git a/opal/mca/pmix/pmix2x/pmix/test/simple/simpdie.c b/opal/mca/pmix/pmix2x/pmix/test/simple/simpdie.c new file mode 100644 index 00000000000..1949e3e391a --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/test/simple/simpdie.c @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2013 Los Alamos National Security, LLC. + * All rights reserved. + * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ + +#include +#include + +#include +#include +#include +#include + +#include "src/class/pmix_object.h" +#include "src/buffer_ops/types.h" +#include "src/util/argv.h" +#include "src/util/output.h" +#include "src/util/printf.h" + +static pmix_proc_t myproc; +static bool completed; + +static void notification_fn(size_t evhdlr_registration_id, + pmix_status_t status, + const pmix_proc_t *source, + pmix_info_t info[], size_t ninfo, + pmix_info_t results[], size_t nresults, + pmix_event_notification_cbfunc_fn_t cbfunc, + void *cbdata) +{ + size_t n; + + pmix_output(0, "Client %s:%d NOTIFIED with status %d source %s:%d and %d info", + myproc.nspace, myproc.rank, status, source->nspace, source->rank, (int)ninfo); + for (n=0; n < ninfo; n++) { + if (0 == strncmp(info[n].key, PMIX_PROCID, PMIX_MAX_KEYLEN) && + PMIX_PROC == info[n].value.type) { + pmix_output(0, "[%s:%d] added proc: %s:%d", myproc.nspace, myproc.rank, + info[n].value.data.proc->nspace, info[n].value.data.proc->rank); + } else { + pmix_output(0, "[%s:%d] key: %s", myproc.nspace, myproc.rank, info[n].key); + } + } + if (NULL != cbfunc) { + cbfunc(PMIX_SUCCESS, NULL, 0, NULL, NULL, cbdata); + } + completed = true; +} + +static void op_callbk(pmix_status_t status, + void *cbdata) +{ + pmix_output(0, "CLIENT: OP CALLBACK CALLED WITH STATUS %d", status); +} + +static void errhandler_reg_callbk (pmix_status_t status, + size_t errhandler_ref, + void *cbdata) +{ + pmix_output(0, "Client: ERRHANDLER REGISTRATION CALLBACK CALLED WITH STATUS %d, ref=%lu", + status, (unsigned long)errhandler_ref); +} + +int main(int argc, char **argv) +{ + int rc; + pmix_value_t value; + pmix_value_t *val = &value; + pmix_proc_t proc; + uint32_t nprocs; + + /* init us */ + if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) { + pmix_output(0, "Client ns %s rank %d: PMIx_Init failed: %d", myproc.nspace, myproc.rank, rc); + exit(0); + } + pmix_output(0, 
"Client ns %s rank %d: Running", myproc.nspace, myproc.rank); + + /* get our universe size */ + (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); + proc.rank = PMIX_RANK_WILDCARD; + if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) { + pmix_output(0, "Client ns %s rank %d: PMIx_Get universe size failed: %d", myproc.nspace, myproc.rank, rc); + goto done; + } + nprocs = val->data.uint32; + PMIX_VALUE_RELEASE(val); + pmix_output(0, "Client %s:%d universe size %d", myproc.nspace, myproc.rank, nprocs); + completed = false; + + /* register our errhandler */ + PMIx_Register_event_handler(NULL, 0, NULL, 0, + notification_fn, errhandler_reg_callbk, NULL); + + /* call fence to sync */ + PMIX_PROC_CONSTRUCT(&proc); + (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); + proc.rank = PMIX_RANK_WILDCARD; + if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, NULL, 0))) { + pmix_output(0, "Client ns %s rank %d: PMIx_Fence failed: %d", myproc.nspace, myproc.rank, rc); + goto done; + } + + /* rank=0 dies */ + if (4 < nprocs) { + /* have one exit */ + if (0 == myproc.rank) { + pmix_output(0, "Client ns %s rank %d: bye-bye!", myproc.nspace, myproc.rank); + exit(1); + } else if (1 == myproc.rank) { + usleep(500000); + pmix_output(0, "Client ns %s rank %d: bye-bye!", myproc.nspace, myproc.rank); + exit(1); + } + } else if (0 == myproc.rank) { + pmix_output(0, "Client ns %s rank %d: bye-bye!", myproc.nspace, myproc.rank); + exit(1); + } + /* everyone simply waits */ + while (!completed) { + struct timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = 100000; + nanosleep(&ts, NULL); + } + + done: + /* finalize us */ + pmix_output(0, "Client ns %s rank %d: Finalizing", myproc.nspace, myproc.rank); + PMIx_Deregister_event_handler(1, op_callbk, NULL); + + if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) { + fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc); + } else { + fprintf(stderr, "Client ns %s rank 
%d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank); + } + fflush(stderr); + return(0); +} From b910d55a772a1c0c6fc6f4dafa8e4eaa3c53d2df Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 28 May 2017 20:43:56 -0700 Subject: [PATCH 20/29] Update PMIx to 2.0.0rc1 - correcting incomplete cherry-pick Signed-off-by: Ralph Castain --- opal/mca/pmix/pmix2x/pmix/VERSION | 6 +- opal/mca/pmix/pmix2x/pmix/autogen.pl | 6 +- opal/mca/pmix/pmix2x/pmix/config/pmix.m4 | 48 +- .../pmix/config/pmix_check_pthread_pids.m4 | 109 ++ .../pmix2x/pmix/config/pmix_config_asm.m4 | 1307 +++++++++++++++++ .../pmix/config/pmix_config_pthreads.m4 | 669 +++++++++ .../pmix2x/pmix/config/pmix_config_threads.m4 | 71 + .../pmix2x/pmix/config/pmix_try_assemble.m4 | 52 + opal/mca/pmix/pmix2x/pmix/include/Makefile.am | 2 +- .../pmix/pmix2x/pmix/include/pmix_common.h | 5 + opal/mca/pmix/pmix2x/pmix/src/Makefile.am | 5 +- .../pmix2x/pmix/src/atomics/asm/Makefile.am | 92 ++ .../pmix2x/pmix/src/atomics/asm/asm-data.txt | 133 ++ .../pmix/pmix2x/pmix/src/atomics/asm/asm.c | 75 + .../pmix2x/pmix/src/atomics/asm/base/ARM.asm | 153 ++ .../pmix2x/pmix/src/atomics/asm/base/IA32.asm | 110 ++ .../pmix2x/pmix/src/atomics/asm/base/IA64.asm | 109 ++ .../pmix2x/pmix/src/atomics/asm/base/MIPS.asm | 196 +++ .../pmix/src/atomics/asm/base/POWERPC32.asm | 168 +++ .../pmix/src/atomics/asm/base/POWERPC64.asm | 157 ++ .../pmix/src/atomics/asm/base/SPARCV9_32.asm | 171 +++ .../pmix/src/atomics/asm/base/SPARCV9_64.asm | 111 ++ .../pmix/src/atomics/asm/base/X86_64.asm | 52 + .../pmix2x/pmix/src/atomics/asm/base/aix.conf | 44 + .../pmix/src/atomics/asm/base/default.conf | 34 + .../pmix/src/atomics/asm/generate-all-asm.pl | 27 + .../pmix/src/atomics/asm/generate-asm.pl | 123 ++ .../asm/generated/atomic-ia32-cygwin-nongas.s | 109 ++ .../asm/generated/atomic-ia32-cygwin.s | 111 ++ .../asm/generated/atomic-ia32-linux-nongas.s | 125 ++ .../atomics/asm/generated/atomic-ia32-linux.s | 127 ++ 
.../atomics/asm/generated/atomic-ia32-osx.s | 109 ++ .../asm/generated/atomic-ia64-linux-nongas.s | 108 ++ .../atomics/asm/generated/atomic-ia64-linux.s | 110 ++ .../atomics/asm/generated/atomic-mips-irix.s | 195 +++ .../atomics/asm/generated/atomic-mips-linux.s | 197 +++ .../asm/generated/atomic-mips64-linux.s | 197 +++ .../atomics/asm/generated/atomic-mips64el.s | 195 +++ .../asm/generated/atomic-powerpc32-64-osx.s | 165 +++ .../asm/generated/atomic-powerpc32-aix.s | 156 ++ .../generated/atomic-powerpc32-linux-nongas.s | 118 ++ .../asm/generated/atomic-powerpc32-linux.s | 120 ++ .../asm/generated/atomic-powerpc32-osx.s | 100 ++ .../asm/generated/atomic-powerpc64-aix.s | 230 +++ .../generated/atomic-powerpc64-linux-nongas.s | 180 +++ .../asm/generated/atomic-powerpc64-linux.s | 182 +++ .../asm/generated/atomic-powerpc64-osx.s | 156 ++ .../asm/generated/atomic-sparcv9-32-solaris.s | 190 +++ .../asm/generated/atomic-sparcv9-64-solaris.s | 130 ++ .../generated/atomic-x86_64-linux-nongas.s | 63 + .../asm/generated/atomic-x86_64-linux.s | 65 + .../pmix/src/atomics/sys/Makefile.include | 44 + .../pmix/src/atomics/sys/architecture.h | 57 + .../pmix/src/atomics/sys/arm/Makefile.include | 24 + .../pmix2x/pmix/src/atomics/sys/arm/atomic.h | 277 ++++ .../pmix2x/pmix/src/atomics/sys/arm/timer.h | 34 + .../src/atomics/sys/arm64/Makefile.include | 24 + .../pmix/src/atomics/sys/arm64/atomic.h | 302 ++++ .../pmix2x/pmix/src/atomics/sys/arm64/timer.h | 46 + .../pmix/pmix2x/pmix/src/atomics/sys/atomic.h | 623 ++++++++ .../pmix2x/pmix/src/atomics/sys/atomic_impl.h | 439 ++++++ .../pmix/pmix2x/pmix/src/atomics/sys/cma.h | 125 ++ .../atomics/sys/gcc_builtin/Makefile.include | 26 + .../pmix/src/atomics/sys/gcc_builtin/atomic.h | 229 +++ .../src/atomics/sys/ia32/Makefile.include | 24 + .../pmix2x/pmix/src/atomics/sys/ia32/atomic.h | 223 +++ .../pmix2x/pmix/src/atomics/sys/ia32/timer.h | 59 + .../src/atomics/sys/ia64/Makefile.include | 24 + .../pmix2x/pmix/src/atomics/sys/ia64/atomic.h | 
146 ++ .../pmix2x/pmix/src/atomics/sys/ia64/timer.h | 49 + .../src/atomics/sys/mips/Makefile.include | 24 + .../pmix2x/pmix/src/atomics/sys/mips/atomic.h | 199 +++ .../pmix2x/pmix/src/atomics/sys/mips/timer.h | 34 + .../src/atomics/sys/powerpc/Makefile.include | 24 + .../pmix/src/atomics/sys/powerpc/atomic.h | 464 ++++++ .../pmix/src/atomics/sys/powerpc/timer.h | 53 + .../src/atomics/sys/sparcv9/Makefile.include | 24 + .../pmix/src/atomics/sys/sparcv9/atomic.h | 198 +++ .../pmix/src/atomics/sys/sparcv9/timer.h | 68 + .../atomics/sys/sync_builtin/Makefile.include | 24 + .../src/atomics/sys/sync_builtin/atomic.h | 137 ++ .../pmix/pmix2x/pmix/src/atomics/sys/timer.h | 131 ++ .../src/atomics/sys/x86_64/Makefile.include | 26 + .../pmix/src/atomics/sys/x86_64/atomic.h | 281 ++++ .../pmix/src/atomics/sys/x86_64/timer.h | 75 + .../pmix2x/pmix/src/buffer_ops/open_close.c | 32 +- .../pmix/pmix2x/pmix/src/buffer_ops/pack.c | 5 + .../pmix/pmix2x/pmix/src/buffer_ops/unpack.c | 7 +- .../pmix/src/class/pmix_pointer_array.c | 281 ++-- .../pmix/src/class/pmix_pointer_array.h | 56 +- .../pmix2x/pmix/src/client/Makefile.include | 2 +- .../pmix/pmix2x/pmix/src/client/pmix_client.c | 131 +- .../pmix2x/pmix/src/client/pmix_client_ops.h | 5 +- .../pmix/pmix2x/pmix/src/dstore/pmix_esh.c | 8 +- .../pmix/src/event/pmix_event_notification.c | 405 +++-- .../pmix/src/event/pmix_event_registration.c | 77 +- .../pmix2x/pmix/src/include/pmix_globals.c | 13 +- .../pmix2x/pmix/src/include/pmix_globals.h | 21 +- .../pmix2x/pmix/src/include/pmix_stdint.h | 250 +--- .../pmix/src/mca/ptl/base/ptl_base_sendrecv.c | 21 +- .../pmix2x/pmix/src/runtime/pmix_finalize.c | 2 + .../pmix/pmix2x/pmix/src/runtime/pmix_init.c | 9 +- .../pmix/src/runtime/pmix_progress_threads.c | 43 +- .../pmix/pmix2x/pmix/src/server/pmix_server.c | 7 +- .../pmix2x/pmix/src/server/pmix_server_ops.c | 4 +- .../pmix2x/pmix/src/server/pmix_server_ops.h | 1 - .../pmix2x/pmix/src/threads/Makefile.include | 40 + 
.../pmix/pmix2x/pmix/src/threads/condition.c | 39 + .../pmix/pmix2x/pmix/src/threads/condition.h | 78 + opal/mca/pmix/pmix2x/pmix/src/threads/mutex.c | 94 ++ opal/mca/pmix/pmix2x/pmix/src/threads/mutex.h | 103 ++ .../pmix/pmix2x/pmix/src/threads/mutex_unix.h | 215 +++ .../mca/pmix/pmix2x/pmix/src/threads/thread.c | 134 ++ .../pmix2x/pmix/src/threads/thread_usage.h | 109 ++ .../pmix/pmix2x/pmix/src/threads/threads.h | 128 ++ opal/mca/pmix/pmix2x/pmix/src/threads/tsd.h | 179 +++ .../pmix/pmix2x/pmix/src/threads/wait_sync.c | 102 ++ .../pmix/pmix2x/pmix/src/threads/wait_sync.h | 118 ++ opal/mca/pmix/pmix2x/pmix/src/util/error.c | 2 + opal/mca/pmix/pmix2x/pmix/test/Makefile.am | 4 +- .../pmix/pmix2x/pmix/test/simple/Makefile.am | 10 +- .../pmix/pmix2x/pmix/test/simple/simptest.c | 118 +- 122 files changed, 14301 insertions(+), 562 deletions(-) create mode 100644 opal/mca/pmix/pmix2x/pmix/config/pmix_check_pthread_pids.m4 create mode 100644 opal/mca/pmix/pmix2x/pmix/config/pmix_config_asm.m4 create mode 100644 opal/mca/pmix/pmix2x/pmix/config/pmix_config_pthreads.m4 create mode 100644 opal/mca/pmix/pmix2x/pmix/config/pmix_config_threads.m4 create mode 100644 opal/mca/pmix/pmix2x/pmix/config/pmix_try_assemble.m4 create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/Makefile.am create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/asm-data.txt create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/asm.c create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/ARM.asm create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/IA32.asm create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/IA64.asm create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/MIPS.asm create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/POWERPC32.asm create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/POWERPC64.asm create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/SPARCV9_32.asm create mode 100644 
opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/SPARCV9_64.asm create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/X86_64.asm create mode 100755 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/aix.conf create mode 100755 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/default.conf create mode 100755 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generate-all-asm.pl create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generate-asm.pl create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-cygwin-nongas.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-cygwin.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-linux-nongas.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-linux.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-osx.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia64-linux-nongas.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia64-linux.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips-irix.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips-linux.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips64-linux.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips64el.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-64-osx.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-aix.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-linux-nongas.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-linux.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-osx.s create mode 100644 
opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-aix.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-linux-nongas.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-linux.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-osx.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-sparcv9-32-solaris.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-sparcv9-64-solaris.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-x86_64-linux-nongas.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-x86_64-linux.s create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/Makefile.include create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/architecture.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm/Makefile.include create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm/atomic.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm/timer.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm64/Makefile.include create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm64/atomic.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm64/timer.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/atomic.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/atomic_impl.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/cma.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/gcc_builtin/Makefile.include create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/gcc_builtin/atomic.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia32/Makefile.include create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia32/atomic.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia32/timer.h create 
mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia64/Makefile.include create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia64/atomic.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia64/timer.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/mips/Makefile.include create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/mips/atomic.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/mips/timer.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/powerpc/Makefile.include create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/powerpc/atomic.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/powerpc/timer.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sparcv9/Makefile.include create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sparcv9/atomic.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sparcv9/timer.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sync_builtin/Makefile.include create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sync_builtin/atomic.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/timer.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/x86_64/Makefile.include create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/x86_64/atomic.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/atomics/sys/x86_64/timer.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/threads/Makefile.include create mode 100644 opal/mca/pmix/pmix2x/pmix/src/threads/condition.c create mode 100644 opal/mca/pmix/pmix2x/pmix/src/threads/condition.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/threads/mutex.c create mode 100644 opal/mca/pmix/pmix2x/pmix/src/threads/mutex.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/threads/mutex_unix.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/threads/thread.c create mode 100644 opal/mca/pmix/pmix2x/pmix/src/threads/thread_usage.h create mode 100644 
opal/mca/pmix/pmix2x/pmix/src/threads/threads.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/threads/tsd.h create mode 100644 opal/mca/pmix/pmix2x/pmix/src/threads/wait_sync.c create mode 100644 opal/mca/pmix/pmix2x/pmix/src/threads/wait_sync.h diff --git a/opal/mca/pmix/pmix2x/pmix/VERSION b/opal/mca/pmix/pmix2x/pmix/VERSION index 4de2c58255e..727df5f26ac 100644 --- a/opal/mca/pmix/pmix2x/pmix/VERSION +++ b/opal/mca/pmix/pmix2x/pmix/VERSION @@ -23,14 +23,14 @@ release=0 # The only requirement is that it must be entirely printable ASCII # characters and have no white space. -greek=a1 +greek= # If repo_rev is empty, then the repository version number will be # obtained during "make dist" via the "git describe --tags --always" # command, or with the date (if "git describe" fails) in the form of # "date". -repo_rev=git223d70e +repo_rev=git1ce71dd # If tarball_version is not empty, it is used as the version string in # the tarball filename, regardless of all other versions listed in @@ -44,7 +44,7 @@ tarball_version= # The date when this release was created -date="Nov 09, 2016" +date="May 28, 2017" # The shared library version of each of PMIx's public libraries. # These versions are maintained in accordance with the "Library diff --git a/opal/mca/pmix/pmix2x/pmix/autogen.pl b/opal/mca/pmix/pmix2x/pmix/autogen.pl index 8ca33503628..e8aa569bc94 100755 --- a/opal/mca/pmix/pmix2x/pmix/autogen.pl +++ b/opal/mca/pmix/pmix2x/pmix/autogen.pl @@ -4,7 +4,7 @@ # Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2013 Mellanox Technologies, Inc. # All rights reserved. -# Copyright (c) 2013-2016 Intel, Inc. All rights reserved. +# Copyright (c) 2013-2017 Intel, Inc. All rights reserved. # Copyright (c) 2015 Research Organization for Information Science # and Technology (RIST). All rights reserved. # Copyright (c) 2015 IBM Corporation. All rights reserved. 
@@ -55,9 +55,9 @@ my $exclude_list; # Minimum versions -my $pmix_automake_version = "1.12.2"; +my $pmix_automake_version = "1.15.0"; my $pmix_autoconf_version = "2.69"; -my $pmix_libtool_version = "2.4.2"; +my $pmix_libtool_version = "2.4.6"; # Search paths my $pmix_autoconf_search = "autoconf"; diff --git a/opal/mca/pmix/pmix2x/pmix/config/pmix.m4 b/opal/mca/pmix/pmix2x/pmix/config/pmix.m4 index 236a9fd9242..395b78406fd 100644 --- a/opal/mca/pmix/pmix2x/pmix/config/pmix.m4 +++ b/opal/mca/pmix/pmix2x/pmix/config/pmix.m4 @@ -179,6 +179,8 @@ AC_DEFUN([PMIX_SETUP_CORE],[ AC_CHECK_TYPES(uint32_t) AC_CHECK_TYPES(int64_t) AC_CHECK_TYPES(uint64_t) + AC_CHECK_TYPES(__int128) + AC_CHECK_TYPES(uint128_t) AC_CHECK_TYPES(long long) AC_CHECK_TYPES(intptr_t) @@ -302,6 +304,17 @@ AC_DEFUN([PMIX_SETUP_CORE],[ PMIX_CHECK_ATTRIBUTES PMIX_CHECK_COMPILER_VERSION_ID + ################################## + # Assembler Configuration + ################################## + + pmix_show_subtitle "Assembler" + + AM_PROG_AS + AC_PATH_PROG(PERL, perl, perl) + PMIX_CONFIG_ASM + + ################################## # Header files ################################## @@ -618,6 +631,28 @@ AC_DEFUN([PMIX_SETUP_CORE],[ AC_C_BIGENDIAN PMIX_CHECK_BROKEN_QSORT + # + # Check out what thread support we have + # + PMIX_CONFIG_THREADS + + CFLAGS="$CFLAGS $THREAD_CFLAGS" + CPPFLAGS="$CPPFLAGS $THREAD_CPPFLAGS" + CXXFLAGS="$CXXFLAGS $THREAD_CXXFLAGS" + CXXCPPFLAGS="$CXXCPPFLAGS $THREAD_CXXCPPFLAGS" + LDFLAGS="$LDFLAGS $THREAD_LDFLAGS" + LIBS="$LIBS $THREAD_LIBS" + + # + # What is the local equivalent of "ln -s" + # + + AC_PROG_LN_S + + AC_PROG_GREP + AC_PROG_EGREP + + ################################## # Visibility ################################## @@ -708,6 +743,7 @@ AC_DEFUN([PMIX_SETUP_CORE],[ pmix_config_prefix[Makefile] pmix_config_prefix[config/Makefile] pmix_config_prefix[include/Makefile] + pmix_config_prefix[src/atomics/asm/Makefile] pmix_config_prefix[src/Makefile] 
pmix_config_prefix[src/util/keyval/Makefile] pmix_config_prefix[src/mca/base/Makefile] @@ -983,15 +1019,15 @@ fi # Install backward compatibility support for PMI-1 and PMI-2 # AC_MSG_CHECKING([if want backward compatibility for PMI-1 and PMI-2]) -AC_ARG_ENABLE(pmix-backward-compatibility, - AC_HELP_STRING([--enable-pmix-backward-compatibility], +AC_ARG_ENABLE(pmi-backward-compatibility, + AC_HELP_STRING([--enable-pmi-backward-compatibility], [enable PMIx support for PMI-1 and PMI-2 (default: enabled)])) -if test "$enable_pmix_backward_compatibility" = "no"; then +if test "$enable_pmi_backward_compatibility" = "no"; then AC_MSG_RESULT([no]) - WANT_PMIX_BACKWARD=0 + WANT_PMI_BACKWARD=0 else AC_MSG_RESULT([yes]) - WANT_PMIX_BACKWARD=1 + WANT_PMI_BACKWARD=1 fi AM_CONDITIONAL([WANT_INSTALL_HEADERS], [test $WANT_INSTALL_HEADERS -eq 1]) @@ -1009,7 +1045,7 @@ AC_DEFUN([PMIX_DO_AM_CONDITIONALS],[ AM_CONDITIONAL([WANT_DSTORE], [test "x$enable_dstore" != "xno"]) AM_CONDITIONAL([WANT_PRIMARY_HEADERS], [test "x$pmix_install_primary_headers" = "xyes"]) AM_CONDITIONAL(WANT_INSTALL_HEADERS, test "$WANT_INSTALL_HEADERS" = 1) - AM_CONDITIONAL(WANT_PMIX_BACKWARD, test "$WANT_PMIX_BACKWARD" = 1) + AM_CONDITIONAL(WANT_PMI_BACKWARD, test "$WANT_PMI_BACKWARD" = 1) ]) pmix_did_am_conditionals=yes ])dnl diff --git a/opal/mca/pmix/pmix2x/pmix/config/pmix_check_pthread_pids.m4 b/opal/mca/pmix/pmix2x/pmix/config/pmix_check_pthread_pids.m4 new file mode 100644 index 00000000000..2bf03579d82 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/config/pmix_check_pthread_pids.m4 @@ -0,0 +1,109 @@ +dnl +dnl Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana +dnl University Research and Technology +dnl Corporation. All rights reserved. +dnl Copyright (c) 2004-2005 The University of Tennessee and The University +dnl of Tennessee Research Foundation. All rights +dnl reserved. +dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +dnl University of Stuttgart. 
All rights reserved. +dnl Copyright (c) 2004-2005 The Regents of the University of California. +dnl All rights reserved. +dnl Copyright (c) 2008-2013 Cisco Systems, Inc. All rights reserved. +dnl Copyright (c) 2017 Intel, Inc. All rights reserved. +dnl $COPYRIGHT$ +dnl +dnl Additional copyrights may follow +dnl +dnl $HEADER$ +dnl + +AC_DEFUN([PMIX_CHECK_PTHREAD_PIDS],[ +# +# Arguments: none +# +# Dependencies: None +# +# Sets: +# PMIX_THREADS_HAVE_DIFFERENT_PIDS (variable) +# +# Test for Linux-like threads in the system. PMIX does not support +# systems with different PIDs for threads in the same process, so error +# out if we detect that case. +# + +AC_MSG_CHECKING([if threads have different pids (pthreads on linux)]) + +PMIX_VAR_SCOPE_PUSH([tpids_CFLAGS_save tpids_CPPFLAGS_save tpids_LDFLAGS_save tpids_LIBS_save tpids_MSG]) +tpids_CFLAGS_save="$CFLAGS" +CFLAGS="$CFLAGS $THREAD_CFLAGS" +tpids_CPPFLAGS_save="$CPPFLAGS" +CPPFLAGS="$CPPFLAGS $THREAD_CPPFLAGS" +tpids_LDFLAGS_save="$LDFLAGS" +LDFLAGS="$LDFLAGS $THREAD_LDFLAGS" +tpids_LIBS_save="$LIBS" +LIBS="$LIBS $THREAD_LIBS" +AC_RUN_IFELSE([AC_LANG_SOURCE([#include <pthread.h> +#include <sys/types.h> +#include <unistd.h> +#include <stdlib.h> + +void *checkpid(void *arg); +int main() { + pthread_t thr; + int pid, *retval; + pid = getpid(); + pthread_create(&thr, NULL, checkpid, &pid); + pthread_join(thr, (void **) &retval); + exit(*retval); +} + +static int ret; +void *checkpid(void *arg) { + int ppid = *((int *) arg); + if (ppid == getpid()) + ret = 0; + else + ret = 1; + pthread_exit((void *) &ret); +}])], +[tpids_MSG=no PMIX_THREADS_HAVE_DIFFERENT_PIDS=0], +[tpids_MSG=yes PMIX_THREADS_HAVE_DIFFERENT_PIDS=1], +[ + # If we're cross compiling, we can't do another AC_* function here because + # we haven't displayed the result from the last one yet. So defer + # another test until below.
+ PMIX_THREADS_HAVE_DIFFERENT_PIDS= + tpids_MSG="cross compiling (need another test)"]) + +CFLAGS="$tpids_CFLAGS_save" +CPPFLAGS="$tpids_CPPFLAGS_save" +LDFLAGS="$tpids_LDFLAGS_save" +LIBS="$tpids_LIBS_save" + +AC_MSG_RESULT([$tpids_MSG]) + +AS_IF([test "x$PMIX_THREADS_HAVE_DIFFERENT_PIDS" = "x"], + [ # If we are cross-compiling, look for the symbol + # __linuxthreads_create_event, which seems to only exist in the + # Linux Threads-based pthreads implementation (i.e., the one + # that has different PIDs for each thread). We *could* switch + # on $host here and only test *linux* hosts, but this test is + # pretty unique, so why bother? Note that AC_CHECK_FUNC works + # properly in cross-compiling environments in recent-enough + # versions of Autoconf (which is one of the reasons we mandate + # recent versions in autogen!). + AC_CHECK_FUNC([__linuxthreads_create_event], + [PMIX_THREADS_HAVE_DIFFERENT_PIDS=1])]) + +AS_IF([test "$PMIX_THREADS_HAVE_DIFFERENT_PIDS" = "1"], + [AC_MSG_WARN([This version of PMIx only supports environments where]) + AC_MSG_WARN([threads have the same PID]) + AC_MSG_ERROR([Cannot continue]) + ]) + +# +# if pthreads is not available, then the system does not have an insane threads +# model +# +PMIX_VAR_SCOPE_POP])dnl diff --git a/opal/mca/pmix/pmix2x/pmix/config/pmix_config_asm.m4 b/opal/mca/pmix/pmix2x/pmix/config/pmix_config_asm.m4 new file mode 100644 index 00000000000..858e1e6309b --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/config/pmix_config_asm.m4 @@ -0,0 +1,1307 @@ +dnl +dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +dnl University Research and Technology +dnl Corporation. All rights reserved. +dnl Copyright (c) 2004-2015 The University of Tennessee and The University +dnl of Tennessee Research Foundation. All rights +dnl reserved. +dnl Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, +dnl University of Stuttgart. All rights reserved.
+dnl Copyright (c) 2004-2005 The Regents of the University of California. +dnl All rights reserved. +dnl Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved. +dnl Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. +dnl Copyright (c) 2015-2017 Research Organization for Information Science +dnl and Technology (RIST). All rights reserved. +dnl Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights +dnl reserved. +dnl Copyright (c) 2017 Amazon.com, Inc. or its affiliates. All Rights +dnl reserved. +dnl Copyright (c) 2017 Intel, Inc. All rights reserved. +dnl $COPYRIGHT$ +dnl +dnl Additional copyrights may follow +dnl +dnl $HEADER$ +dnl + + +AC_DEFUN([PMIX_CHECK_SYNC_BUILTIN_CSWAP_INT128], [ + + PMIX_VAR_SCOPE_PUSH([sync_bool_compare_and_swap_128_result CFLAGS_save]) + + AC_ARG_ENABLE([cross-cmpset128],[AC_HELP_STRING([--enable-cross-cmpset128], + [enable the use of the __sync builtin atomic compare-and-swap 128 when cross compiling])]) + + sync_bool_compare_and_swap_128_result=0 + + if test ! 
"$enable_cross_cmpset128" = "yes" ; then + AC_MSG_CHECKING([for processor support of __sync builtin atomic compare-and-swap on 128-bit values]) + + AC_RUN_IFELSE([AC_LANG_PROGRAM([], [__int128 x = 0; __sync_bool_compare_and_swap (&x, 0, 1);])], + [AC_MSG_RESULT([yes]) + sync_bool_compare_and_swap_128_result=1], + [AC_MSG_RESULT([no])], + [AC_MSG_RESULT([no (cross compiling)])]) + + if test $sync_bool_compare_and_swap_128_result = 0 ; then + CFLAGS_save=$CFLAGS + CFLAGS="$CFLAGS -mcx16" + + AC_MSG_CHECKING([for __sync builtin atomic compare-and-swap on 128-bit values with -mcx16 flag]) + AC_RUN_IFELSE([AC_LANG_PROGRAM([], [__int128 x = 0; __sync_bool_compare_and_swap (&x, 0, 1);])], + [AC_MSG_RESULT([yes]) + sync_bool_compare_and_swap_128_result=1 + CFLAGS_save="$CFLAGS"], + [AC_MSG_RESULT([no])], + [AC_MSG_RESULT([no (cross compiling)])]) + + CFLAGS=$CFLAGS_save + fi + else + AC_MSG_CHECKING([for compiler support of __sync builtin atomic compare-and-swap on 128-bit values]) + + # Check if the compiler supports the __sync builtin + AC_TRY_LINK([], [__int128 x = 0; __sync_bool_compare_and_swap (&x, 0, 1);], + [AC_MSG_RESULT([yes]) + sync_bool_compare_and_swap_128_result=1], + [AC_MSG_RESULT([no])]) + + if test $sync_bool_compare_and_swap_128_result = 0 ; then + CFLAGS_save=$CFLAGS + CFLAGS="$CFLAGS -mcx16" + + AC_MSG_CHECKING([for __sync builtin atomic compare-and-swap on 128-bit values with -mcx16 flag]) + AC_TRY_LINK([], [__int128 x = 0; __sync_bool_compare_and_swap (&x, 0, 1);], + [AC_MSG_RESULT([yes]) + sync_bool_compare_and_swap_128_result=1 + CFLAGS_save="$CFLAGS"], + [AC_MSG_RESULT([no])]) + + CFLAGS=$CFLAGS_save + fi + fi + + AC_DEFINE_UNQUOTED([PMIX_HAVE_SYNC_BUILTIN_CSWAP_INT128], [$sync_bool_compare_and_swap_128_result], + [Whether the __sync builtin atomic compare and swap supports 128-bit values]) + + PMIX_VAR_SCOPE_POP +]) + +AC_DEFUN([PMIX_CHECK_SYNC_BUILTINS], [ + AC_MSG_CHECKING([for __sync builtin atomics]) + + AC_TRY_LINK([long tmp;], 
[__sync_synchronize(); +__sync_bool_compare_and_swap(&tmp, 0, 1); +__sync_add_and_fetch(&tmp, 1);], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) + + AC_MSG_CHECKING([for 64-bit __sync builtin atomics]) + + AC_TRY_LINK([ +#include <stdint.h> +uint64_t tmp;], [ +__sync_bool_compare_and_swap(&tmp, 0, 1); +__sync_add_and_fetch(&tmp, 1);], + [AC_MSG_RESULT([yes]) + pmix_asm_sync_have_64bit=1], + [AC_MSG_RESULT([no]) + pmix_asm_sync_have_64bit=0]) + + AC_DEFINE_UNQUOTED([PMIX_ASM_SYNC_HAVE_64BIT],[$pmix_asm_sync_have_64bit], + [Whether 64-bit is supported by the __sync builtin atomics]) + + # Check for 128-bit support + PMIX_CHECK_SYNC_BUILTIN_CSWAP_INT128 +]) + + +AC_DEFUN([PMIX_CHECK_GCC_BUILTIN_CSWAP_INT128], [ + + PMIX_VAR_SCOPE_PUSH([atomic_compare_exchange_n_128_result CFLAGS_save]) + + AC_ARG_ENABLE([cross-cmpset128],[AC_HELP_STRING([--enable-cross-cmpset128], + [enable the use of the __sync builtin atomic compare-and-swap 128 when cross compiling])]) + + atomic_compare_exchange_n_128_result=0 + + if test !
"$enable_cross_cmpset128" = "yes" ; then + AC_MSG_CHECKING([for processor support of __atomic builtin atomic compare-and-swap on 128-bit values]) + + AC_RUN_IFELSE([AC_LANG_PROGRAM([], [__int128 x = 0, y = 0; __atomic_compare_exchange_n (&x, &y, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED);])], + [AC_MSG_RESULT([yes]) + atomic_compare_exchange_n_128_result=1], + [AC_MSG_RESULT([no])], + [AC_MSG_RESULT([no (cross compiling)])]) + + if test $atomic_compare_exchange_n_128_result = 0 ; then + CFLAGS_save=$CFLAGS + CFLAGS="$CFLAGS -mcx16" + + AC_MSG_CHECKING([for __atomic builtin atomic compare-and-swap on 128-bit values with -mcx16 flag]) + AC_RUN_IFELSE([AC_LANG_PROGRAM([], [__int128 x = 0, y = 0; __atomic_compare_exchange_n (&x, &y, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED);])], + [AC_MSG_RESULT([yes]) + atomic_compare_exchange_n_128_result=1 + CFLAGS_save="$CFLAGS"], + [AC_MSG_RESULT([no])], + [AC_MSG_RESULT([no (cross compiling)])]) + + CFLAGS=$CFLAGS_save + fi + + if test $atomic_compare_exchange_n_128_result = 1 ; then + AC_MSG_CHECKING([if __int128 atomic compare-and-swap is always lock-free]) + AC_RUN_IFELSE([AC_LANG_PROGRAM([], [if (!__atomic_always_lock_free(16, 0)) { return 1; }])], + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no]) + PMIX_CHECK_SYNC_BUILTIN_CSWAP_INT128 + atomic_compare_exchange_n_128_result=0], + [AC_MSG_RESULT([no (cross compiling)])]) + fi + else + AC_MSG_CHECKING([for compiler support of __atomic builtin atomic compare-and-swap on 128-bit values]) + + # Check if the compiler supports the __atomic builtin + AC_TRY_LINK([], [__int128 x = 0, y = 0; __atomic_compare_exchange_n (&x, &y, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED);], + [AC_MSG_RESULT([yes]) + atomic_compare_exchange_n_128_result=1], + [AC_MSG_RESULT([no])]) + + if test $atomic_compare_exchange_n_128_result = 0 ; then + CFLAGS_save=$CFLAGS + CFLAGS="$CFLAGS -mcx16" + + AC_MSG_CHECKING([for __atomic builtin atomic compare-and-swap on 128-bit values with -mcx16 flag]) + AC_TRY_LINK([], 
[__int128 x = 0, y = 0; __atomic_compare_exchange_n (&x, &y, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED);], + [AC_MSG_RESULT([yes]) + atomic_compare_exchange_n_128_result=1 + CFLAGS_save="$CFLAGS"], + [AC_MSG_RESULT([no])]) + + CFLAGS=$CFLAGS_save + fi + fi + + AC_DEFINE_UNQUOTED([PMIX_HAVE_GCC_BUILTIN_CSWAP_INT128], [$atomic_compare_exchange_n_128_result], + [Whether the __atomic builtin atomic compare and swap is lock-free on 128-bit values]) + + PMIX_VAR_SCOPE_POP +]) + +AC_DEFUN([PMIX_CHECK_GCC_ATOMIC_BUILTINS], [ + AC_MSG_CHECKING([for __atomic builtin atomics]) + + AC_TRY_LINK([long tmp, old = 0;], [__atomic_thread_fence(__ATOMIC_SEQ_CST); +__atomic_compare_exchange_n(&tmp, &old, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED); +__atomic_add_fetch(&tmp, 1, __ATOMIC_RELAXED);], + [AC_MSG_RESULT([yes]) + $1], + [AC_MSG_RESULT([no]) + $2]) + + # Check for 128-bit support + PMIX_CHECK_GCC_BUILTIN_CSWAP_INT128 +]) + + +dnl ################################################################# +dnl +dnl PMIX_CHECK_ASM_TEXT +dnl +dnl Determine how to set current mode as text. 
+dnl +dnl ################################################################# +AC_DEFUN([PMIX_CHECK_ASM_TEXT],[ + AC_MSG_CHECKING([directive for setting text section]) + pmix_cv_asm_text="" + if test "$pmix_cv_c_compiler_vendor" = "microsoft" ; then + # text section will be brought in with the rest of + # header for MS - leave blank for now + pmix_cv_asm_text="" + else + case $host in + *-aix*) + pmix_cv_asm_text=[".csect .text[PR]"] + ;; + *) + pmix_cv_asm_text=".text" + ;; + esac + fi + AC_MSG_RESULT([$pmix_cv_asm_text]) + AC_DEFINE_UNQUOTED([PMIX_ASM_TEXT], ["$pmix_cv_asm_text"], + [Assembly directive for setting text section]) + PMIX_ASM_TEXT="$pmix_cv_asm_text" + AC_SUBST(PMIX_ASM_TEXT) +])dnl + + +dnl ################################################################# +dnl +dnl PMIX_CHECK_ASM_GLOBAL +dnl +dnl Sets PMIX_ASM_GLOBAL to the value to prefix global values +dnl +dnl I'm sure if I don't have a test for this, there will be some +dnl dumb platform that uses something else +dnl +dnl ################################################################# +AC_DEFUN([PMIX_CHECK_ASM_GLOBAL],[ + AC_MSG_CHECKING([directive for exporting symbols]) + pmix_cv_asm_global="" + if test "$pmix_cv_c_compiler_vendor" = "microsoft" ; then + pmix_cv_asm_global="PUBLIC" + else + case $host in + *) + pmix_cv_asm_global=".globl" + ;; + esac + fi + AC_MSG_RESULT([$pmix_cv_asm_global]) + AC_DEFINE_UNQUOTED([PMIX_ASM_GLOBAL], ["$pmix_cv_asm_global"], + [Assembly directive for exporting symbols]) + PMIX_ASM_GLOBAL="$pmix_cv_asm_global" + AC_SUBST(PMIX_ASM_GLOBAL) +])dnl + + +dnl ################################################################# +dnl +dnl PMIX_CHECK_ASM_LSYM +dnl +dnl Sets PMIX_ASM_LSYM to the prefix value on a symbol to make it +dnl an internal label (jump target and whatnot) +dnl +dnl We look for L .L $ L$ (in that order) for something that both +dnl assembles and does not leave a label in the output of nm.
Fall +dnl back to L if nothing else seems to work :/ +dnl +dnl ################################################################# + +# _PMIX_CHECK_ASM_LSYM([variable-to-set]) +# --------------------------------------- +AC_DEFUN([_PMIX_CHECK_ASM_LSYM],[ + AC_REQUIRE([AC_PROG_GREP]) + + $1="L" + + for sym in L .L $ L$ ; do + asm_result=0 + echo "configure: trying $sym" >&AC_FD_CC + PMIX_TRY_ASSEMBLE([foobar$pmix_cv_asm_label_suffix +${sym}mytestlabel$pmix_cv_asm_label_suffix], + [# ok, we succeeded at assembling. see if we can nm, + # throwing the results in a file + if $NM conftest.$OBJEXT > conftest.out 2>&AC_FD_CC ; then + if test "`$GREP mytestlabel conftest.out`" = "" ; then + # there was no symbol... looks promising to me + $1="$sym" + asm_result=1 + elif test ["`$GREP ' [Nt] .*mytestlabel' conftest.out`"] = "" ; then + # see if we have a non-global-ish symbol + # but we should see if we can do better. + $1="$sym" + fi + else + # not so much on the NM goodness :/ + echo "$NM failed. Output from NM was:" >&AC_FD_CC + cat conftest.out >&AC_FD_CC + AC_MSG_WARN([$NM could not read object file]) + fi + ]) + if test "$asm_result" = "1" ; then + break + fi + done + rm -f conftest.out + unset asm_result sym +]) + +# PMIX_CHECK_ASM_LSYM() +# --------------------- +AC_DEFUN([PMIX_CHECK_ASM_LSYM],[ + AC_REQUIRE([AC_PROG_NM]) + + AC_CACHE_CHECK([prefix for lsym labels], + [pmix_cv_asm_lsym], + [_PMIX_CHECK_ASM_LSYM([pmix_cv_asm_lsym])]) + AC_DEFINE_UNQUOTED([PMIX_ASM_LSYM], ["$pmix_cv_asm_lsym"], + [Assembly prefix for lsym labels]) + PMIX_ASM_LSYM="$pmix_cv_asm_lsym" + AC_SUBST(PMIX_ASM_LSYM) +])dnl + +dnl ################################################################# +dnl +dnl PMIX_CHECK_ASM_PROC +dnl +dnl Sets a cv-flag, if the compiler needs a proc/endp-definition to +dnl link with C. 
+dnl +dnl ################################################################# +AC_DEFUN([PMIX_CHECK_ASM_PROC],[ + AC_CACHE_CHECK([if .proc/endp is needed], + [pmix_cv_asm_need_proc], + [pmix_cv_asm_need_proc="no" + PMIX_TRY_ASSEMBLE([ + .proc mysym +mysym: + .endp mysym], + [pmix_cv_asm_need_proc="yes"]) + rm -f conftest.out]) + + if test "$pmix_cv_asm_need_proc" = "yes" ; then + pmix_cv_asm_proc=".proc" + pmix_cv_asm_endproc=".endp" + else + pmix_cv_asm_proc="#" + pmix_cv_asm_endproc="#" + fi +])dnl + + +dnl ################################################################# +dnl +dnl PMIX_CHECK_ASM_GSYM +dnl +dnl Sets PMIX_ASM_GSYM to the prefix value on a symbol to make it +dnl a global linkable from C. Basically, an _ or not. +dnl +dnl ################################################################# +AC_DEFUN([PMIX_CHECK_ASM_GSYM],[ + AC_CACHE_CHECK([prefix for global symbol labels], + [pmix_cv_asm_gsym], + [_PMIX_CHECK_ASM_GSYM]) + + if test "$pmix_cv_asm_gsym" = "none" ; then + AC_MSG_ERROR([Could not determine global symbol label prefix]) + fi + + AC_DEFINE_UNQUOTED([PMIX_ASM_GSYM], ["$pmix_cv_asm_gsym"], + [Assembly prefix for gsym labels]) + PMIX_ASM_GSYM="$pmix_cv_asm_gsym" + AC_SUBST(PMIX_ASM_GSYM) + +]) + +AC_DEFUN([_PMIX_CHECK_ASM_GSYM],[ + pmix_cv_asm_gsym="none" + + for sym in "_" "" "." 
; do + asm_result=0 + echo "configure: trying $sym" >&AC_FD_CC +cat > conftest_c.c <&AC_FD_CC + pmix_link="$CC $CFLAGS conftest_c.$OBJEXT conftest.$OBJEXT -o conftest $LDFLAGS $LIBS > conftest.link 2>&1" + if AC_TRY_EVAL(pmix_link) ; then + # save the warnings + cat conftest.link >&AC_FD_CC + asm_result=1 + else + cat conftest.link >&AC_FD_CC + echo "configure: failed C program was: " >&AC_FD_CC + cat conftest_c.c >&AC_FD_CC + echo "configure: failed ASM program was: " >&AC_FD_CC + cat conftest.s >&AC_FD_CC + asm_result=0 + fi + else + # save output and failed program + cat conftest.cmpl >&AC_FD_CC + echo "configure: failed C program was: " >&AC_FD_CC + cat conftest.c >&AC_FD_CC + asm_result=0 + fi], + [asm_result=0]) + if test "$asm_result" = "1" ; then + pmix_cv_asm_gsym="$sym" + break + fi + done + rm -rf conftest.* +])dnl + + +dnl ################################################################# +dnl +dnl PMIX_CHECK_ASM_LABEL_SUFFIX +dnl +dnl Sets PMIX_ASM_LABEL_SUFFIX to the value to suffix for labels +dnl +dnl I'm sure if I don't have a test for this, there will be some +dnl dumb platform that uses something else +dnl +dnl ################################################################# +AC_DEFUN([PMIX_CHECK_ASM_LABEL_SUFFIX],[ + AC_MSG_CHECKING([suffix for labels]) + pmix_cv_asm_label_suffix="" + case $host in + *) + pmix_cv_asm_label_suffix=":" + ;; + esac + AC_MSG_RESULT([$pmix_cv_asm_label_suffix]) + AC_DEFINE_UNQUOTED([PMIX_ASM_LABEL_SUFFIX], ["$pmix_cv_asm_label_suffix"], + [Assembly suffix for labels]) + PMIX_ASM_LABEL_SUFFIX="$pmix_cv_asm_label_suffix" + AC_SUBST(PMIX_ASM_LABEL_SUFFIX) +])dnl + + +dnl ################################################################# +dnl +dnl PMIX_CHECK_ASM_ALIGN_LOG +dnl +dnl Sets PMIX_ASM_ALIGN_LOG to 1 if align is specified +dnl logarithmically, 0 otherwise +dnl +dnl ################################################################# +AC_DEFUN([PMIX_CHECK_ASM_ALIGN_LOG],[ + AC_REQUIRE([AC_PROG_NM]) +
AC_REQUIRE([AC_PROG_GREP]) + + AC_CACHE_CHECK([if .align directive takes logarithmic value], + [pmix_cv_asm_align_log], + [ PMIX_TRY_ASSEMBLE([ $pmix_cv_asm_text + .align 4 + $pmix_cv_asm_global foo + .byte 1 + .align 4 +foo$pmix_cv_asm_label_suffix + .byte 2], + [pmix_asm_addr=[`$NM conftest.$OBJEXT | $GREP foo | sed -e 's/.*\([0-9a-fA-F][0-9a-fA-F]\).*foo.*/\1/'`]], + [pmix_asm_addr=""]) + # test for both 16 and 10 (decimal and hex notations) + echo "configure: .align test address offset is $pmix_asm_addr" >&AC_FD_CC + if test "$pmix_asm_addr" = "16" || test "$pmix_asm_addr" = "10" ; then + pmix_cv_asm_align_log="yes" + else + pmix_cv_asm_align_log="no" + fi]) + + if test "$pmix_cv_asm_align_log" = "yes" || test "$pmix_cv_asm_align_log" = "1" ; then + pmix_asm_align_log_result=1 + else + pmix_asm_align_log_result=0 + fi + + AC_DEFINE_UNQUOTED([PMIX_ASM_ALIGN_LOG], + [$pmix_asm_align_log_result], + [Assembly align directive expects logarithmic value]) + + unset pmix_asm_addr asm_result +])dnl + + +dnl ################################################################# +dnl +dnl PMIX_CHECK_ASM_TYPE +dnl +dnl Sets PMIX_ASM_TYPE to the prefix for the function type to +dnl set a symbol's type as function (needed on ELF for shared +dnl libraries). If no .type directive is needed, sets PMIX_ASM_TYPE +dnl to an empty string +dnl +dnl We look for @ \# % +dnl +dnl ################################################################# +AC_DEFUN([PMIX_CHECK_ASM_TYPE],[ + AC_CACHE_CHECK([prefix for function in .type], + [pmix_cv_asm_type], + [_PMIX_CHECK_ASM_TYPE]) + + AC_DEFINE_UNQUOTED([PMIX_ASM_TYPE], ["$pmix_cv_asm_type"], + [How to set function type in .type directive]) + PMIX_ASM_TYPE="$pmix_cv_asm_type" + AC_SUBST(PMIX_ASM_TYPE) +]) + +AC_DEFUN([_PMIX_CHECK_ASM_TYPE],[ + pmix_cv_asm_type="" + + case "${host}" in + *-sun-solaris*) + # GCC on solaris seems to accept just about anything, not + # that what it defines actually works...
So just hardwire
+        # to the right answer
+        pmix_cv_asm_type="#"
+    ;;
+    *)
+        for type in @ \# % ; do
+            asm_result=0
+            echo "configure: trying $type" >&AC_FD_CC
+            PMIX_TRY_ASSEMBLE([     .type mysym, ${type}function
+mysym:],
+                 [pmix_cv_asm_type="${type}"
+                    asm_result=1])
+            if test "$asm_result" = "1" ; then
+                break
+            fi
+        done
+    ;;
+    esac
+    rm -f conftest.out
+
+    unset asm_result type
+])dnl
+
+
+dnl #################################################################
+dnl
+dnl PMIX_CHECK_ASM_SIZE
+dnl
+dnl Sets PMIX_ASM_SIZE to 1 if we should set .size directives for
+dnl each function, 0 otherwise.  The answer is "yes" whenever the
+dnl assembler accepts a .size directive.  PMIX_ASM_SIZE is both
+dnl defined and substituted.
+dnl
+dnl #################################################################
+AC_DEFUN([PMIX_CHECK_ASM_SIZE],[
+    AC_CACHE_CHECK([if .size is needed],
+                   [pmix_cv_asm_need_size],
+                   [pmix_cv_asm_need_size="no"
+                    PMIX_TRY_ASSEMBLE([     .size mysym, 1],
+                          [pmix_cv_asm_need_size="yes"])
+                    rm -f conftest.out])
+
+    if test "$pmix_cv_asm_need_size" = "yes" ; then
+       pmix_asm_size=1
+    else
+       pmix_asm_size=0
+    fi
+
+    AC_DEFINE_UNQUOTED([PMIX_ASM_SIZE], ["$pmix_asm_size"],
+                       [Do we need to give a .size directive])
+    PMIX_ASM_SIZE="$pmix_asm_size"
+    dnl substitute the variable set on the previous line; this used to
+    dnl name PMIX_ASM_TYPE, leaving PMIX_ASM_SIZE unsubstituted
+    AC_SUBST(PMIX_ASM_SIZE)
+    unset asm_result
+])dnl
+
+
+# PMIX_CHECK_ASM_GNU_STACKEXEC(var)
+# ----------------------------------
+# sets shell variable var to the things necessary to
+# disable executable stacks with GAS
+AC_DEFUN([PMIX_CHECK_ASM_GNU_STACKEXEC], [
+    AC_REQUIRE([AC_PROG_GREP])
+
+    AC_CHECK_PROG([OBJDUMP], [objdump], [objdump])
+    AC_CACHE_CHECK([if .note.GNU-stack is needed],
+        [pmix_cv_asm_gnu_stack_result],
+        [AS_IF([test "$OBJDUMP" != ""],
+            [ # first, see if a simple C program has it set
+             cat >conftest.c <<EOF
+int testfunc() {return 0; }
+EOF
+             PMIX_LOG_COMMAND([$CC $CFLAGS -c conftest.c -o conftest.$OBJEXT],
+                 [$OBJDUMP -x conftest.$OBJEXT | $GREP '\.note\.GNU-stack' > /dev/null && pmix_cv_asm_gnu_stack_result=yes],
+                 [PMIX_LOG_MSG([the failed program was:], 1)
+                  PMIX_LOG_FILE([conftest.c])
+                  pmix_cv_asm_gnu_stack_result=no])
+             if test "$pmix_cv_asm_gnu_stack_result" != "yes" ; then
+                 pmix_cv_asm_gnu_stack_result="no"
+             fi
+             rm -rf conftest.*],
+            [pmix_cv_asm_gnu_stack_result="no"])])
+    if test "$pmix_cv_asm_gnu_stack_result" = "yes" ; then
+        pmix_cv_asm_gnu_stack=1
+    else
+        pmix_cv_asm_gnu_stack=0
+    fi
+])dnl
+
+
+dnl #################################################################
+dnl
+dnl PMIX_CHECK_POWERPC_REG
+dnl
+dnl See if the notation for specifying registers is X (most everyone)
+dnl or rX (OS X).  Defines PMIX_POWERPC_R_REGISTERS to 0 or 1 and
+dnl aborts configure if neither form assembles.
+dnl
+dnl #################################################################
+AC_DEFUN([PMIX_CHECK_POWERPC_REG],[
+    AC_MSG_CHECKING([if PowerPC registers have r prefix])
+    PMIX_TRY_ASSEMBLE([$pmix_cv_asm_text
+        addi 1,1,0],
+        [pmix_cv_asm_powerpc_r_reg=0],
+        [PMIX_TRY_ASSEMBLE([$pmix_cv_asm_text
+        addi r1,r1,0],
+            [pmix_cv_asm_powerpc_r_reg=1],
+            [AC_MSG_ERROR([Can not determine how to use PPC registers])])])
+    if test "$pmix_cv_asm_powerpc_r_reg" = "1" ; then
+        AC_MSG_RESULT([yes])
+    else
+        AC_MSG_RESULT([no])
+    fi
+
+    AC_DEFINE_UNQUOTED([PMIX_POWERPC_R_REGISTERS],
+                       [$pmix_cv_asm_powerpc_r_reg],
+                       [Whether r notation is used for ppc registers])
+])dnl
+
+
+dnl #################################################################
+dnl
+dnl PMIX_CHECK_POWERPC_64BIT
+dnl
+dnl On some powerpc chips (the PPC970 or G5), the OS usually runs in
+dnl 32 bit mode, even though the hardware can do 64bit things.  If
+dnl the compiler will let us, emit code for 64bit test and set type
+dnl operations (on a long long).
+dnl
+dnl #################################################################
+AC_DEFUN([PMIX_CHECK_POWERPC_64BIT],[
+    if test "$ac_cv_sizeof_long" != "4" ; then
+        # this function should only be called in the 32 bit case
+        AC_MSG_ERROR([CHECK_POWERPC_64BIT called on 64 bit platform.
Internal error.])
+    fi
+    AC_MSG_CHECKING([for 64-bit PowerPC assembly support])
+    case $host in
+        *-darwin*)
+            ppc64_result=0
+            if test "$pmix_cv_asm_powerpc_r_reg" = "1" ; then
+               ldarx_asm=" ldarx r1,r1,r1";
+            else
+               ldarx_asm=" ldarx 1,1,1";
+            fi
+            PMIX_TRY_ASSEMBLE([$pmix_cv_asm_text
+        $ldarx_asm],
+                    [ppc64_result=1],
+                    [ppc64_result=0])
+        ;;
+        *)
+            ppc64_result=0
+        ;;
+    esac
+
+    if test "$ppc64_result" = "1" ; then
+        AC_MSG_RESULT([yes])
+        ifelse([$1],,:,[$1])
+    else
+        AC_MSG_RESULT([no])
+        ifelse([$2],,:,[$2])
+    fi
+
+    unset ppc64_result ldarx_asm
+])dnl
+
+
+dnl #################################################################
+dnl
+dnl PMIX_CHECK_SPARCV8PLUS
+dnl
+dnl Runs the first argument if the assembler accepts the v8+/v9
+dnl "casa" instruction, the second argument otherwise.
+dnl
+dnl #################################################################
+AC_DEFUN([PMIX_CHECK_SPARCV8PLUS],[
+    AC_MSG_CHECKING([if have Sparc v8+/v9 support])
+    sparc_result=0
+    PMIX_TRY_ASSEMBLE([$pmix_cv_asm_text
+        casa [%o0] 0x80, %o1, %o2],
+                [sparc_result=1],
+                [sparc_result=0])
+    if test "$sparc_result" = "1" ; then
+        AC_MSG_RESULT([yes])
+        ifelse([$1],,:,[$1])
+    else
+        AC_MSG_RESULT([no])
+        ifelse([$2],,:,[$2])
+    fi
+
+    unset sparc_result
+])dnl
+
+dnl #################################################################
+dnl
+dnl PMIX_CHECK_CMPXCHG16B
+dnl
+dnl Defines PMIX_HAVE_CMPXCHG16B to 1 when the x86_64 cmpxchg16b
+dnl instruction can be used and the compiler copies volatile 128-bit
+dnl values correctly, 0 otherwise.  Natively the instruction is
+dnl executed; with --enable-cross-cmpxchg16b it is only assembled.
+dnl
+dnl #################################################################
+AC_DEFUN([PMIX_CHECK_CMPXCHG16B],[
+    PMIX_VAR_SCOPE_PUSH([cmpxchg16b_result])
+
+    AC_ARG_ENABLE([cross-cmpxchg16b],[AC_HELP_STRING([--enable-cross-cmpxchg16b],
+                  [enable the use of the cmpxchg16b instruction when cross compiling])])
+
+    if test ! "$enable_cross_cmpxchg16b" = "yes" ; then
+        AC_MSG_CHECKING([if processor supports x86_64 16-byte compare-and-exchange])
+        AC_RUN_IFELSE([AC_LANG_PROGRAM([[unsigned char tmp[16];]],[[
+    __asm__ __volatile__ ("lock cmpxchg16b (%%rsi)" : : "S" (tmp) : "memory", "cc");]])],
+                      [AC_MSG_RESULT([yes])
+                       cmpxchg16b_result=1],
+                      [AC_MSG_RESULT([no])
+                       cmpxchg16b_result=0],
+                      [AC_MSG_RESULT([no (cross-compiling)])
+                       cmpxchg16b_result=0])
+    else
+        AC_MSG_CHECKING([if assembler supports x86_64 16-byte compare-and-exchange])
+
+        PMIX_TRY_ASSEMBLE([$pmix_cv_asm_text
+                           cmpxchg16b 0],
+                          [AC_MSG_RESULT([yes])
+                           cmpxchg16b_result=1],
+                          [AC_MSG_RESULT([no])
+                           cmpxchg16b_result=0])
+    fi
+    if test "$cmpxchg16b_result" = 1; then
+        AC_MSG_CHECKING([if compiler correctly handles volatile 128bits])
+        AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdint.h>
+#include <assert.h>
+
+union pmix_counted_pointer_t {
+    struct {
+        uint64_t counter;
+        uint64_t item;
+    } data;
+#if defined(HAVE___INT128) && HAVE___INT128
+    __int128 value;
+#elif defined(HAVE_INT128_T) && HAVE_INT128_T
+    int128_t value;
+#endif
+};
+typedef union pmix_counted_pointer_t pmix_counted_pointer_t;],
+                                       [volatile pmix_counted_pointer_t a;
+    pmix_counted_pointer_t b;
+
+    a.data.counter = 0;
+    a.data.item = 0x1234567890ABCDEF;
+
+    b.data.counter = a.data.counter;
+    b.data.item = a.data.item;
+
+    /* bozo checks */
+    assert(16 == sizeof(pmix_counted_pointer_t));
+    assert(a.data.counter == b.data.counter);
+    assert(a.data.item == b.data.item);
+    /*
+     * the following test fails on buggy compilers
+     * so far, with icc -o conftest conftest.c
+     *  - intel icc 14.0.0.080 (aka 2013sp1)
+     *  - intel icc 14.0.1.106 (aka 2013sp1u1)
+     * older and more recent compilers work fine
+     * buggy compilers work also fine but only with -O0
+     */
+#if (defined(HAVE___INT128) && HAVE___INT128) || (defined(HAVE_INT128_T) && HAVE_INT128_T)
+    return (a.value != b.value);
+#else
+    return 0;
+#endif])],
+                      [AC_MSG_RESULT([yes])],
+                      [AC_MSG_RESULT([no])
+                       cmpxchg16b_result=0],
+
[AC_MSG_RESULT([untested, assuming ok])])
+    fi
+    AC_DEFINE_UNQUOTED([PMIX_HAVE_CMPXCHG16B], [$cmpxchg16b_result],
+        [Whether the processor supports the cmpxchg16b instruction])
+    PMIX_VAR_SCOPE_POP
+])dnl
+
+dnl #################################################################
+dnl
+dnl PMIX_CHECK_INLINE_C_GCC
+dnl
+dnl Check if the compiler is capable of doing GCC-style inline
+dnl assembly.  Some compilers emit a warning and ignore the inline
+dnl assembly (xlc on OS X) and compile without error.  Therefore,
+dnl the test attempts to run the emitted code to check that the
+dnl assembly is actually run.  To run this test, one argument to
+dnl the macro must be an assembly instruction in gcc format to move
+dnl the value 0 into the register containing the variable ret.
+dnl For PowerPC, this would be:
+dnl
+dnl   "li %0,0" : "=&r"(ret)
+dnl
+dnl For testing ia32 assembly, the assembly instruction xaddl is
+dnl tested.  The xaddl instruction is used by some of the atomic
+dnl implementations so it makes sense to test for it.  In addition,
+dnl some compilers (i.e. earlier versions of Sun Studio 12) do not
+dnl necessarily handle xaddl properly, so that needs to be detected
+dnl during configure time.
+dnl
+dnl If no instruction is supplied the test is skipped and the
+dnl answer is "no".
+dnl
+dnl DEFINE PMIX_C_GCC_INLINE_ASSEMBLY to 0 or 1 depending on GCC
+dnl                support
+dnl
+dnl #################################################################
+AC_DEFUN([PMIX_CHECK_INLINE_C_GCC],[
+    assembly="$1"
+    asm_result="unknown"
+
+    AC_MSG_CHECKING([if $CC supports GCC inline assembly])
+
+    if test ! "$assembly" = "" ; then
+        AC_RUN_IFELSE([AC_LANG_PROGRAM([AC_INCLUDES_DEFAULT],[[
+int ret = 1;
+int negone = -1;
+__asm__ __volatile__ ($assembly);
+return ret;
+                                                             ]])],
+                      [asm_result="yes"], [asm_result="no"],
+                      [asm_result="unknown"])
+    else
+        assembly="test skipped - assuming no"
+        # nothing to test: record the answer now so the link test
+        # below is not run with the message above pasted in as code
+        asm_result="no"
+    fi
+
+    # if we're cross compiling, just try to compile and figure good enough
+    if test "$asm_result" = "unknown" ; then
+        AC_LINK_IFELSE([AC_LANG_PROGRAM([AC_INCLUDES_DEFAULT],[[
+int ret = 1;
+int negone = -1;
+__asm__ __volatile__ ($assembly);
+return ret;
+                                                              ]])],
+                       [asm_result="yes"], [asm_result="no"])
+    fi
+
+    AC_MSG_RESULT([$asm_result])
+
+    if test "$asm_result" = "yes" ; then
+        PMIX_C_GCC_INLINE_ASSEMBLY=1
+    else
+        PMIX_C_GCC_INLINE_ASSEMBLY=0
+    fi
+
+    AC_DEFINE_UNQUOTED([PMIX_C_GCC_INLINE_ASSEMBLY],
+                       [$PMIX_C_GCC_INLINE_ASSEMBLY],
+                       [Whether C compiler supports GCC style inline assembly])
+
+    unset PMIX_C_GCC_INLINE_ASSEMBLY assembly asm_result
+])dnl
+
+
+dnl #################################################################
+dnl
+dnl PMIX_CHECK_INLINE_C_DEC
+dnl
+dnl DEFINE PMIX_C_DEC_INLINE_ASSEMBLY to 0 or 1 depending on DEC
+dnl                support
+dnl
+dnl #################################################################
+AC_DEFUN([PMIX_CHECK_INLINE_C_DEC],[
+
+    AC_MSG_CHECKING([if $CC supports DEC inline assembly])
+
+    AC_LINK_IFELSE([AC_LANG_PROGRAM([
+AC_INCLUDES_DEFAULT
+#include <c_asm.h>],
+[[asm("");
+return 0;]])],
+        [asm_result="yes"], [asm_result="no"])
+
+    AC_MSG_RESULT([$asm_result])
+
+    if test "$asm_result" = "yes" ; then
+        PMIX_C_DEC_INLINE_ASSEMBLY=1
+    else
+        PMIX_C_DEC_INLINE_ASSEMBLY=0
+    fi
+
+    AC_DEFINE_UNQUOTED([PMIX_C_DEC_INLINE_ASSEMBLY],
+                       [$PMIX_C_DEC_INLINE_ASSEMBLY],
+                       [Whether C compiler supports DEC style inline assembly])
+
+    unset PMIX_C_DEC_INLINE_ASSEMBLY asm_result
+])dnl
+
+
+dnl #################################################################
+dnl
+dnl PMIX_CHECK_INLINE_C_XLC
+dnl
+dnl DEFINE PMIX_C_XLC_INLINE_ASSEMBLY to 0 or 1 depending on XLC
+dnl                support
+dnl
+dnl
#################################################################
+AC_DEFUN([PMIX_CHECK_INLINE_C_XLC],[
+
+    AC_MSG_CHECKING([if $CC supports XLC inline assembly])
+
+    PMIX_C_XLC_INLINE_ASSEMBLY=0
+    asm_result="no"
+    if test "$CC" = "xlc" ; then
+        dnl set the same variable that is defined and unset below; the
+        dnl old name PMIX_XLC_INLINE_ASSEMBLY left the define stuck at 0
+        PMIX_C_XLC_INLINE_ASSEMBLY=1
+        asm_result="yes"
+    fi
+
+    AC_MSG_RESULT([$asm_result])
+    AC_DEFINE_UNQUOTED([PMIX_C_XLC_INLINE_ASSEMBLY],
+                       [$PMIX_C_XLC_INLINE_ASSEMBLY],
+                       [Whether C compiler supports XLC style inline assembly])
+
+    unset PMIX_C_XLC_INLINE_ASSEMBLY
+])dnl
+
+
+dnl #################################################################
+dnl
+dnl PMIX_CONFIG_ASM
+dnl
+dnl DEFINE PMIX_ASSEMBLY_ARCH to something in sys/architecture.h
+dnl DEFINE PMIX_ASSEMBLY_FORMAT to string containing correct
+dnl                             format for assembly (not user friendly)
+dnl SUBST PMIX_ASSEMBLY_FORMAT to string containing correct
+dnl                             format for assembly (not user friendly)
+dnl
+dnl #################################################################
+AC_DEFUN([PMIX_CONFIG_ASM],[
+    AC_REQUIRE([PMIX_SETUP_CC])
+    AC_REQUIRE([AM_PROG_AS])
+
+    AC_ARG_ENABLE([builtin-atomics],
+      [AC_HELP_STRING([--enable-builtin-atomics],
+         [Enable use of __sync builtin atomics (default: enabled)])],
+         [], [enable_builtin_atomics="yes"])
+
+    pmix_cv_asm_builtin="BUILTIN_NO"
+    if test "$pmix_cv_asm_builtin" = "BUILTIN_NO" && test "$enable_builtin_atomics" = "yes" ; then
+       PMIX_CHECK_GCC_ATOMIC_BUILTINS([pmix_cv_asm_builtin="BUILTIN_GCC"], [])
+    fi
+    if test "$pmix_cv_asm_builtin" = "BUILTIN_NO" && test "$enable_builtin_atomics" = "yes" ; then
+       PMIX_CHECK_SYNC_BUILTINS([pmix_cv_asm_builtin="BUILTIN_SYNC"], [])
+    fi
+
+        PMIX_CHECK_ASM_PROC
+        PMIX_CHECK_ASM_TEXT
+        PMIX_CHECK_ASM_GLOBAL
+        PMIX_CHECK_ASM_GNU_STACKEXEC
+        PMIX_CHECK_ASM_LABEL_SUFFIX
+        PMIX_CHECK_ASM_GSYM
+        PMIX_CHECK_ASM_LSYM
+        PMIX_CHECK_ASM_TYPE
+        PMIX_CHECK_ASM_SIZE
+        PMIX_CHECK_ASM_ALIGN_LOG
+
+        # find our architecture for purposes of assembly stuff
+        pmix_cv_asm_arch="UNSUPPORTED"
+        PMIX_GCC_INLINE_ASSIGN=""
+        PMIX_ASM_SUPPORT_64BIT=0
+        case "${host}" in
+        x86_64-*x32)
+            pmix_cv_asm_arch="X86_64"
+            PMIX_ASM_SUPPORT_64BIT=1
+            PMIX_GCC_INLINE_ASSIGN='"xaddl %1,%0" : "=m"(ret), "+r"(negone) : "m"(ret)'
+            ;;
+        i?86-*|x86_64*|amd64*)
+            if test "$ac_cv_sizeof_long" = "4" ; then
+                pmix_cv_asm_arch="IA32"
+            else
+                pmix_cv_asm_arch="X86_64"
+            fi
+            PMIX_ASM_SUPPORT_64BIT=1
+            PMIX_GCC_INLINE_ASSIGN='"xaddl %1,%0" : "=m"(ret), "+r"(negone) : "m"(ret)'
+            PMIX_CHECK_CMPXCHG16B
+            ;;
+
+        ia64-*)
+            pmix_cv_asm_arch="IA64"
+            PMIX_ASM_SUPPORT_64BIT=1
+            PMIX_GCC_INLINE_ASSIGN='"mov %0=r0\n;;\n" : "=&r"(ret)'
+            ;;
+        aarch64*)
+            pmix_cv_asm_arch="ARM64"
+            PMIX_ASM_SUPPORT_64BIT=1
+            PMIX_ASM_ARM_VERSION=8
+            AC_DEFINE_UNQUOTED([PMIX_ASM_ARM_VERSION], [$PMIX_ASM_ARM_VERSION],
+                               [What ARM assembly version to use])
+            PMIX_GCC_INLINE_ASSIGN='"mov %0, #0" : "=&r"(ret)'
+            ;;
+
+        armv7*|arm-*-linux-gnueabihf)
+            pmix_cv_asm_arch="ARM"
+            PMIX_ASM_SUPPORT_64BIT=1
+            PMIX_ASM_ARM_VERSION=7
+            AC_DEFINE_UNQUOTED([PMIX_ASM_ARM_VERSION], [$PMIX_ASM_ARM_VERSION],
+                               [What ARM assembly version to use])
+            PMIX_GCC_INLINE_ASSIGN='"mov %0, #0" : "=&r"(ret)'
+            ;;
+
+        armv6*)
+            pmix_cv_asm_arch="ARM"
+            PMIX_ASM_SUPPORT_64BIT=0
+            PMIX_ASM_ARM_VERSION=6
+            CCASFLAGS="$CCASFLAGS -march=armv7-a"
+            AC_DEFINE_UNQUOTED([PMIX_ASM_ARM_VERSION], [$PMIX_ASM_ARM_VERSION],
+                               [What ARM assembly version to use])
+            PMIX_GCC_INLINE_ASSIGN='"mov %0, #0" : "=&r"(ret)'
+            ;;
+
+        armv5*linux*|armv4*linux*|arm-*-linux-gnueabi)
+            # uses Linux kernel helpers for some atomic operations
+            pmix_cv_asm_arch="ARM"
+            PMIX_ASM_SUPPORT_64BIT=0
+            PMIX_ASM_ARM_VERSION=5
+            CCASFLAGS="$CCASFLAGS -march=armv7-a"
+            AC_DEFINE_UNQUOTED([PMIX_ASM_ARM_VERSION], [$PMIX_ASM_ARM_VERSION],
+                               [What ARM assembly version to use])
+            PMIX_GCC_INLINE_ASSIGN='"mov %0, #0" : "=&r"(ret)'
+            ;;
+
+        mips-*|mips64*)
+            # Should really find some way to make sure that we are on
+            # a MIPS III machine (r4000 and later)
+            pmix_cv_asm_arch="MIPS"
+            PMIX_ASM_SUPPORT_64BIT=1
+            PMIX_GCC_INLINE_ASSIGN='"or %0,[$]0,[$]0" : "=&r"(ret)'
+            ;;
+
+        powerpc-*|powerpc64-*|powerpcle-*|powerpc64le-*|rs6000-*|ppc-*)
+            PMIX_CHECK_POWERPC_REG
+            if test "$ac_cv_sizeof_long" = "4" ; then
+                pmix_cv_asm_arch="POWERPC32"
+
+                # Note that on some platforms (Apple G5), even if we are
+                # compiling in 32 bit mode (and therefore should assume
+                # sizeof(long) == 4), we can use the 64 bit test and set
+                # operations.
+                PMIX_CHECK_POWERPC_64BIT(PMIX_ASM_SUPPORT_64BIT=1)
+            elif test "$ac_cv_sizeof_long" = "8" ; then
+                PMIX_ASM_SUPPORT_64BIT=1
+                pmix_cv_asm_arch="POWERPC64"
+            else
+                AC_MSG_ERROR([Could not determine PowerPC word size: $ac_cv_sizeof_long])
+            fi
+            PMIX_GCC_INLINE_ASSIGN='"1: li %0,0" : "=&r"(ret)'
+            ;;
+
+        # There is no current difference between s390 and s390x
+        # But use two different defines in case some come later
+        # as s390 is 31bits while s390x is 64bits
+        s390-*)
+            pmix_cv_asm_arch="S390"
+            ;;
+        s390x-*)
+            pmix_cv_asm_arch="S390X"
+            ;;
+
+        sparc*-*)
+            # SPARC v9 (and above) are the only ones with 64bit support
+            # if compiling 32 bit, see if we are v9 (aka v8plus) or
+            # earlier (casa is v8+/v9).
+            if test "$ac_cv_sizeof_long" = "4" ; then
+                have_v8plus=0
+                PMIX_CHECK_SPARCV8PLUS([have_v8plus=1])
+                if test "$have_v8plus" = "0" ; then
+                    PMIX_ASM_SUPPORT_64BIT=0
+                    pmix_cv_asm_arch="SPARC"
+AC_MSG_WARN([Sparc v8 target is not supported in this release of Open MPI.])
+AC_MSG_WARN([You must specify the target architecture v8plus to compile])
+AC_MSG_WARN([Open MPI in 32 bit mode on Sparc processors (see the README).])
+AC_MSG_ERROR([Can not continue.])
+                else
+                    PMIX_ASM_SUPPORT_64BIT=1
+                    pmix_cv_asm_arch="SPARCV9_32"
+                fi
+
+            elif test "$ac_cv_sizeof_long" = "8" ; then
+                PMIX_ASM_SUPPORT_64BIT=1
+                pmix_cv_asm_arch="SPARCV9_64"
+            else
+                AC_MSG_ERROR([Could not determine Sparc word size: $ac_cv_sizeof_long])
+            fi
+            PMIX_GCC_INLINE_ASSIGN='"mov 0,%0" : "=&r"(ret)'
+            ;;
+
+        *)
+            PMIX_CHECK_SYNC_BUILTINS([pmix_cv_asm_builtin="BUILTIN_SYNC"],
+              [AC_MSG_ERROR([No atomic primitives available for $host])])
+            ;;
+        esac
+
+        if test "x$PMIX_ASM_SUPPORT_64BIT" = "x1" && test "$pmix_cv_asm_builtin" = "BUILTIN_SYNC" &&
+                test "$pmix_asm_sync_have_64bit" = "0" ; then
+            # __sync builtins exist but do not implement 64-bit support. Fall back on inline asm.
+            pmix_cv_asm_builtin="BUILTIN_NO"
+        fi
+
+      if test "$pmix_cv_asm_builtin" = "BUILTIN_SYNC" || test "$pmix_cv_asm_builtin" = "BUILTIN_GCC" ; then
+        AC_DEFINE([PMIX_C_GCC_INLINE_ASSEMBLY], [1],
+          [Whether C compiler supports GCC style inline assembly])
+      else
+        AC_DEFINE_UNQUOTED([PMIX_ASM_SUPPORT_64BIT],
+            [$PMIX_ASM_SUPPORT_64BIT],
+            [Whether we can do 64bit assembly operations or not.  Should not be used outside of the assembly header files])
+        AC_SUBST([PMIX_ASM_SUPPORT_64BIT])
+
+        #
+        # figure out if we need any special function start / stop code
+        #
+        case $host_os in
+        aix*)
+            pmix_asm_arch_config="aix"
+            ;;
+        *)
+            pmix_asm_arch_config="default"
+            ;;
+        esac
+
+        # now that we know our architecture, try to inline assemble
+        PMIX_CHECK_INLINE_C_GCC([$PMIX_GCC_INLINE_ASSIGN])
+        PMIX_CHECK_INLINE_C_DEC
+        PMIX_CHECK_INLINE_C_XLC
+
+        # format:
+        #   config_file-text-global-label_suffix-gsym-lsym-type-size-align_log-ppc_r_reg-64_bit-gnu_stack
+        asm_format="${pmix_asm_arch_config}"
+        asm_format="${asm_format}-${pmix_cv_asm_text}-${pmix_cv_asm_global}"
+        asm_format="${asm_format}-${pmix_cv_asm_label_suffix}-${pmix_cv_asm_gsym}"
+        asm_format="${asm_format}-${pmix_cv_asm_lsym}"
+        asm_format="${asm_format}-${pmix_cv_asm_type}-${pmix_asm_size}"
+        asm_format="${asm_format}-${pmix_asm_align_log_result}"
+        if test "$pmix_cv_asm_arch" = "POWERPC32" || test "$pmix_cv_asm_arch" = "POWERPC64" ; then
+            asm_format="${asm_format}-${pmix_cv_asm_powerpc_r_reg}"
+        else
+            asm_format="${asm_format}-1"
+        fi
+        asm_format="${asm_format}-${PMIX_ASM_SUPPORT_64BIT}"
+        pmix_cv_asm_format="${asm_format}-${pmix_cv_asm_gnu_stack}"
+        # For the Makefile, need to escape the $ as $$.  Don't display
+        # this version, but make sure the Makefile gives the right thing
+        # when regenerating the files because the base has been touched.
+        PMIX_ASSEMBLY_FORMAT=`echo "$pmix_cv_asm_format" | sed -e 's/\\\$/\\\$\\\$/'`
+
+        AC_MSG_CHECKING([for assembly format])
+        AC_MSG_RESULT([$pmix_cv_asm_format])
+        AC_DEFINE_UNQUOTED([PMIX_ASSEMBLY_FORMAT], ["$PMIX_ASSEMBLY_FORMAT"],
+                           [Format of assembly file])
+        AC_SUBST([PMIX_ASSEMBLY_FORMAT])
+      fi # if pmix_cv_asm_builtin = BUILTIN_SYNC
+
+    result="PMIX_$pmix_cv_asm_arch"
+    PMIX_ASSEMBLY_ARCH="$pmix_cv_asm_arch"
+    AC_MSG_CHECKING([for assembly architecture])
+    AC_MSG_RESULT([$pmix_cv_asm_arch])
+    AC_DEFINE_UNQUOTED([PMIX_ASSEMBLY_ARCH], [$result],
+        [Architecture type of assembly to use for atomic operations and CMA])
+    AC_SUBST([PMIX_ASSEMBLY_ARCH])
+
+    # Check for RDTSCP support
+    # pmix_cv_asm_arch holds the bare architecture name (X86_64, IA32);
+    # only the result string above carries the PMIX_ prefix, so compare
+    # against the bare names or this probe can never run.
+    # RDTSCP writes EAX, EDX and ECX, so all three are declared as
+    # outputs; an output register must not also appear as a clobber.
+    result=0
+    AS_IF([test "$pmix_cv_asm_arch" = "X86_64" || test "$pmix_cv_asm_arch" = "IA32"],
+          [AC_MSG_CHECKING([for RDTSCP assembly support])
+           AC_LANG_PUSH([C])
+           AC_TRY_RUN([[
+int main(int argc, char* argv[])
+{
+  unsigned int rax, rdx, rcx;
+  __asm__ __volatile__ ("rdtscp\n": "=a" (rax), "=d" (rdx), "=c" (rcx));
+  return 0;
+}
+           ]],
+           [result=1
+            AC_MSG_RESULT([yes])],
+           [AC_MSG_RESULT([no])],
+           [#cross compile not supported
+            AC_MSG_RESULT(["no (cross compiling)"])])
+           AC_LANG_POP([C])])
+    AC_DEFINE_UNQUOTED([PMIX_ASSEMBLY_SUPPORTS_RDTSCP], [$result],
+                       [Whether we have support for RDTSCP instruction])
+
+    result="PMIX_$pmix_cv_asm_builtin"
+    PMIX_ASSEMBLY_BUILTIN="$pmix_cv_asm_builtin"
+    AC_MSG_CHECKING([for builtin atomics])
+    AC_MSG_RESULT([$pmix_cv_asm_builtin])
+    AC_DEFINE_UNQUOTED([PMIX_ASSEMBLY_BUILTIN], [$result],
+        [Whether to use builtin atomics])
+    AC_SUBST([PMIX_ASSEMBLY_BUILTIN])
+
+    PMIX_ASM_FIND_FILE
+
+    unset result asm_format
+])dnl
+
+
+dnl #################################################################
+dnl
+dnl PMIX_ASM_FIND_FILE
+dnl
+dnl
+dnl do all the evil mojo to provide a working assembly file
+dnl
+dnl #################################################################
+AC_DEFUN([PMIX_ASM_FIND_FILE], [
+    AC_REQUIRE([AC_PROG_GREP])
+
AC_REQUIRE([AC_PROG_FGREP])
+
+if test "$pmix_cv_asm_arch" != "WINDOWS" && test "$pmix_cv_asm_builtin" != "BUILTIN_SYNC" && test "$pmix_cv_asm_builtin" != "BUILTIN_GCC" && test "$pmix_cv_asm_builtin" != "BUILTIN_OSX" ; then
+    # see if we have a pre-built one already
+    AC_MSG_CHECKING([for pre-built assembly file])
+    pmix_cv_asm_file=""
+    if $GREP "$pmix_cv_asm_arch" "${PMIX_TOP_SRCDIR}/src/atomics/asm/asm-data.txt" | $FGREP "$pmix_cv_asm_format" >conftest.out 2>&1 ; then
+        pmix_cv_asm_file="`cut -f3 conftest.out`"
+        if test ! "$pmix_cv_asm_file" = "" ; then
+            pmix_cv_asm_file="atomic-${pmix_cv_asm_file}.s"
+            if test -f "${PMIX_TOP_SRCDIR}/src/atomics/asm/generated/${pmix_cv_asm_file}" ; then
+                AC_MSG_RESULT([yes ($pmix_cv_asm_file)])
+            else
+                AC_MSG_RESULT([no ($pmix_cv_asm_file not found)])
+                pmix_cv_asm_file=""
+            fi
+        fi
+    else
+        AC_MSG_RESULT([no (not in asm-data)])
+    fi
+    rm -rf conftest.*
+
+    if test "$pmix_cv_asm_file" = "" ; then
+        # Can we generate a file?
+        AC_MSG_CHECKING([whether possible to generate assembly file])
+        # create the directory the generator below actually writes
+        # into; the old relative path pmix/asm/generated was a
+        # different directory, so the output location could be missing
+        mkdir -p "$PMIX_TOP_BUILDDIR/src/atomics/asm/generated"
+        pmix_cv_asm_file="atomic-local.s"
+        pmix_try='$PERL $PMIX_TOP_SRCDIR/src/atomics/asm/generate-asm.pl $pmix_cv_asm_arch "$pmix_cv_asm_format" $PMIX_TOP_SRCDIR/src/atomics/asm/base $PMIX_TOP_BUILDDIR/src/atomics/asm/generated/$pmix_cv_asm_file >conftest.out 2>&1'
+        if AC_TRY_EVAL(pmix_try) ; then
+            # save the warnings
+            cat conftest.out >&AC_FD_CC
+            AC_MSG_RESULT([yes])
+        else
+            # save output
+            cat conftest.out >&AC_FD_CC
+            pmix_cv_asm_file=""
+            AC_MSG_RESULT([failed])
+            AC_MSG_WARN([Could not build atomic operations assembly file.])
+            AC_MSG_WARN([There will be no atomic operations for this build.])
+        fi
+    fi
+    rm -rf conftest.*
+else
+    # On windows with VC++, atomics are done with compiler primitives
+    pmix_cv_asm_file=""
+fi
+
+    AC_MSG_CHECKING([for atomic assembly filename])
+    if test "$pmix_cv_asm_file" = "" ; then
+        AC_MSG_RESULT([none])
+        result=0
+    else
+        AC_MSG_RESULT([$pmix_cv_asm_file])
+        result=1
+    fi
+ + AC_DEFINE_UNQUOTED([PMIX_HAVE_ASM_FILE], [$result], + [Whether there is an atomic assembly file available]) + AM_CONDITIONAL([PMIX_HAVE_ASM_FILE], [test "$result" = "1"]) + + PMIX_ASM_FILE=$pmix_cv_asm_file + AC_SUBST(PMIX_ASM_FILE) +])dnl diff --git a/opal/mca/pmix/pmix2x/pmix/config/pmix_config_pthreads.m4 b/opal/mca/pmix/pmix2x/pmix/config/pmix_config_pthreads.m4 new file mode 100644 index 00000000000..2e2f1fd8f97 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/config/pmix_config_pthreads.m4 @@ -0,0 +1,669 @@ +dnl +dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +dnl University Research and Technology +dnl Corporation. All rights reserved. +dnl Copyright (c) 2004-2005 The University of Tennessee and The University +dnl of Tennessee Research Foundation. All rights +dnl reserved. +dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +dnl University of Stuttgart. All rights reserved. +dnl Copyright (c) 2004-2005 The Regents of the University of California. +dnl All rights reserved. +dnl Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. +dnl Copyright (c) 2014-2017 Intel, Inc. All rights reserved. +dnl Copyright (c) 2014-2016 Research Organization for Information Science +dnl and Technology (RIST). All rights reserved. +dnl $COPYRIGHT$ +dnl +dnl Additional copyrights may follow +dnl +dnl $HEADER$ +dnl +dnl PMIX_CONFIG_POSIX_THREADS() +dnl +dnl Configure posix threads, setting the following variables (but +dnl not calling AC_SUBST on them). + +# ******************************************************************** +# +# Internal macros - do not call from outside PMIX_CONFIG_POSIX_THREADS +# +# ******************************************************************** + + +AC_DEFUN([PMIX_INTL_PTHREAD_TRY_LINK], [ +# BEGIN: PMIX_INTL_PTHREAD_TRY_LINK +# +# Make sure that we can run a small application in C or C++, which +# ever is the current language. Do make sure that C or C++ is the +# current language. 
+#
+# As long as this is not being run....
+# pthread_t may be anything from an int to a struct -- init with self-tid.
+#
+    AC_LINK_IFELSE([AC_LANG_SOURCE([[
+#include <pthread.h>
+
+int i = 3;
+pthread_t me, newthread;
+
+void cleanup_routine(void *foo);
+void *thread_main(void *foo);
+
+void cleanup_routine(void *foo) { i = 4; }
+void *thread_main(void *foo) { i = 2; return (void*) &i; }
+
+int main(int argc, char* argv[])
+{
+    pthread_attr_t attr;
+
+    me = pthread_self();
+    pthread_atfork(NULL, NULL, NULL);
+    pthread_attr_init(&attr);
+    pthread_cleanup_push(cleanup_routine, 0);
+    pthread_create(&newthread, &attr, thread_main, 0);
+    pthread_join(newthread, 0);
+    pthread_cleanup_pop(0);
+
+    return 0;
+}]])],
+                 [$1], [$2])
+# END: PMIX_INTL_PTHREAD_TRY_LINK
+])dnl
+
+
+AC_DEFUN([PMIX_INTL_PTHREAD_TRY_LINK_FORTRAN], [
+# BEGIN: PMIX_INTL_PTHREAD_TRY_LINK_FORTRAN
+#
+# Make sure that we can run a small application in Fortran, with
+# pthreads living in a C object file
+
+# Fortran module
+cat > conftestf.f <<EOF
+      program fpthread
+      call pthreadtest
+      end
+EOF
+
+# C module
+if test -f conftest.h; then
+    pmix_conftest_h="#include \"conftest.h\""
+else
+    pmix_conftest_h=""
+fi
+cat > conftest.c <<EOF
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+$pmix_conftest_h
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int i = 3;
+pthread_t me, newthread;
+
+void cleanup_routine(void *foo);
+void *thread_main(void *foo);
+void pthreadtest_f(void);
+
+void cleanup_routine(void *foo) { i = 4; }
+void *thread_main(void *foo) { i = 2; return (void*) &i; }
+
+void pthreadtest_f(void)
+{
+    pthread_attr_t attr;
+
+    me = pthread_self();
+    pthread_atfork(NULL, NULL, NULL);
+    pthread_attr_init(&attr);
+    pthread_cleanup_push(cleanup_routine, 0);
+    pthread_create(&newthread, &attr, thread_main, 0);
+    pthread_join(newthread, 0);
+    pthread_cleanup_pop(0);
+}
+
+void pthreadtest(void)
+{ pthreadtest_f(); }
+
+void pthreadtest_(void)
+{ pthreadtest_f(); }
+
+void pthreadtest__(void)
+{ pthreadtest_f(); }
+
+void PTHREADTEST(void)
+{ pthreadtest_f(); }
+
+#ifdef __cplusplus
+}
+#endif
+EOF
+
+# Try the compile
+PMIX_LOG_COMMAND(
+    [$CC $CFLAGS -I.
-c conftest.c],
+    PMIX_LOG_COMMAND(
+        [$FC $FCFLAGS conftestf.f conftest.o -o conftest $LDFLAGS $LIBS],
+        [HAPPY=1],
+        [HAPPY=0]),
+    [HAPPY=0])
+
+if test "$HAPPY" = "1"; then
+   $1
+else
+    PMIX_LOG_MSG([here is the C program:], 1)
+    PMIX_LOG_FILE([conftest.c])
+    if test -f conftest.h; then
+        PMIX_LOG_MSG([here is conftest.h:], 1)
+        PMIX_LOG_FILE([conftest.h])
+    fi
+    PMIX_LOG_MSG([here is the fortran program:], 1)
+    PMIX_LOG_FILE([conftestf.f])
+    $2
+fi
+
+unset HAPPY pmix_conftest_h
+rm -rf conftest*
+# END: PMIX_INTL_PTHREAD_TRY_LINK_FORTRAN
+])dnl
+
+
+# ********************************************************************
+#
+# Try to compile thread support without any special flags
+#
+# ********************************************************************
+AC_DEFUN([PMIX_INTL_POSIX_THREADS_PLAIN_C], [
+#
+# C compiler
+#
+if test "$pmix_pthread_c_success" = "0"; then
+  AC_MSG_CHECKING([if C compiler and POSIX threads work as is])
+
+  AC_LANG_PUSH(C)
+  PMIX_INTL_PTHREAD_TRY_LINK(pmix_pthread_c_success=1,
+                            pmix_pthread_c_success=0)
+  AC_LANG_POP(C)
+  if test "$pmix_pthread_c_success" = "1"; then
+    AC_MSG_RESULT([yes])
+  else
+    AC_MSG_RESULT([no])
+  fi
+fi
+])dnl
+
+
+AC_DEFUN([PMIX_INTL_POSIX_THREADS_PLAIN_CXX], [
+#
+# C++ compiler
+#
+if test "$pmix_pthread_cxx_success" = "0"; then
+  AC_MSG_CHECKING([if C++ compiler and POSIX threads work as is])
+
+  AC_LANG_PUSH(C++)
+  PMIX_INTL_PTHREAD_TRY_LINK(pmix_pthread_cxx_success=1,
+                            pmix_pthread_cxx_success=0)
+  AC_LANG_POP(C++)
+  if test "$pmix_pthread_cxx_success" = "1"; then
+    AC_MSG_RESULT([yes])
+  else
+    AC_MSG_RESULT([no])
+  fi
+fi
+])dnl
+
+
+AC_DEFUN([PMIX_INTL_POSIX_THREADS_PLAIN_FC], [
+#
+# Fortran compiler
+#
+# The OMPI_* / ompi_* variables come from an enclosing Open MPI
+# configure and may be unset; default them to 0 so the numeric
+# tests stay well-formed and the Fortran check is simply skipped.
+if test "$pmix_pthread_fortran_success" = "0" && \
+   test "${OMPI_TRY_FORTRAN_BINDINGS:-0}" -gt "${OMPI_FORTRAN_NO_BINDINGS:-0}" && \
+   test "${ompi_fortran_happy:-0}" -eq 1; then
+  AC_MSG_CHECKING([if Fortran compiler and POSIX threads work as is])
+
+  AC_LANG_PUSH(C)
+
PMIX_INTL_PTHREAD_TRY_LINK_FORTRAN(pmix_pthread_fortran_success=1,
+                                    pmix_pthread_fortran_success=0)
+  AC_LANG_POP(C)
+  if test "$pmix_pthread_fortran_success" = "1"; then
+    AC_MSG_RESULT([yes])
+  else
+    AC_MSG_RESULT([no])
+  fi
+fi
+])dnl
+
+
+AC_DEFUN([PMIX_INTL_POSIX_THREADS_PLAIN], [
+# BEGIN: PMIX_INTL_POSIX_THREADS_PLAIN
+#
+# Check whether threads compile without any special flags.
+# The original note here says -D_REENTRANT or -D_THREAD_SAFE is
+# thrown in just in case, because some systems (OS X, for example)
+# generally do not need the defines but an occasional system header
+# does.  Those defines are not added by this macro itself.
+#
+
+# Only run C++ and Fortran if those compilers already configured;
+# a language whose compiler is not configured is marked successful.
+AC_PROVIDE_IFELSE([AC_PROG_CC],
+                  [PMIX_INTL_POSIX_THREADS_PLAIN_C],
+                  [pmix_pthread_c_success=1])
+
+AC_PROVIDE_IFELSE([AC_PROG_CXX],
+                  [PMIX_INTL_POSIX_THREADS_PLAIN_CXX],
+                  [pmix_pthread_cxx_success=1])
+
+AC_PROVIDE_IFELSE([AC_PROG_FC],
+                  [PMIX_INTL_POSIX_THREADS_PLAIN_FC],
+                  [pmix_pthread_fortran_success=1])
+
+# End: PMIX_INTL_POSIX_THREADS_PLAIN
+])dnl
+
+
+# ********************************************************************
+#
+# Try to compile thread support with special compiler flags
+#
+# ********************************************************************
+AC_DEFUN([PMIX_INTL_POSIX_THREADS_SPECIAL_FLAGS_C], [
+#
+# C compiler
+#
+# Try each candidate in pflags appended to the saved orig_CFLAGS and
+# stop at the first one that links; on failure CFLAGS is restored.
+#
+if test "$pmix_pthread_c_success" = "0"; then
+  for pf in $pflags; do
+    AC_MSG_CHECKING([if C compiler and POSIX threads work with $pf])
+    CFLAGS="$orig_CFLAGS $pf"
+    AC_LANG_PUSH(C)
+    PMIX_INTL_PTHREAD_TRY_LINK(pmix_pthread_c_success=1,
+                              pmix_pthread_c_success=0)
+    AC_LANG_POP(C)
+    if test "$pmix_pthread_c_success" = "1"; then
+      PTHREAD_CFLAGS="$pf"
+      AC_MSG_RESULT([yes])
+      break
+    else
+      PTHREAD_CFLAGS=
+      CFLAGS="$orig_CFLAGS"
+      AC_MSG_RESULT([no])
+    fi
+  done
+fi
+])
+
+
+AC_DEFUN([PMIX_INTL_POSIX_THREADS_SPECIAL_FLAGS_CXX], [
+#
+# C++ compiler
+#
+if test "$pmix_pthread_cxx_success" = "0"; then
+  for pf in $pflags; do
+    AC_MSG_CHECKING([if C++ compiler and POSIX
threads work with $pf])
+    CXXFLAGS="$orig_CXXFLAGS $pf"
+    AC_LANG_PUSH(C++)
+    PMIX_INTL_PTHREAD_TRY_LINK(pmix_pthread_cxx_success=1,
+                              pmix_pthread_cxx_success=0)
+    AC_LANG_POP(C++)
+    if test "$pmix_pthread_cxx_success" = "1"; then
+      PTHREAD_CXXFLAGS="$pf"
+      AC_MSG_RESULT([yes])
+      break
+    else
+      PTHREAD_CXXFLAGS=
+      CXXFLAGS="$orig_CXXFLAGS"
+      AC_MSG_RESULT([no])
+    fi
+  done
+fi
+])
+
+
+AC_DEFUN([PMIX_INTL_POSIX_THREADS_SPECIAL_FLAGS_FC], [
+#
+# Fortran compiler
+#
+# The OMPI_* / ompi_* variables come from an enclosing Open MPI
+# configure and may be unset; default them to 0 so the numeric
+# tests stay well-formed and the Fortran check is simply skipped.
+if test "$pmix_pthread_fortran_success" = "0" && \
+   test "${OMPI_TRY_FORTRAN_BINDINGS:-0}" -gt "${OMPI_FORTRAN_NO_BINDINGS:-0}" && \
+   test "${ompi_fortran_happy:-0}" -eq 1; then
+  for pf in $pflags; do
+    AC_MSG_CHECKING([if Fortran compiler and POSIX threads work with $pf])
+    FCFLAGS="$orig_FCFLAGS $pf"
+    AC_LANG_PUSH(C)
+    PMIX_INTL_PTHREAD_TRY_LINK_FORTRAN(pmix_pthread_fortran_success=1,
+                                      pmix_pthread_fortran_success=0)
+    AC_LANG_POP(C)
+    if test "$pmix_pthread_fortran_success" = "1"; then
+      PTHREAD_FCFLAGS="$pf"
+      AC_MSG_RESULT([yes])
+      break
+    else
+      PTHREAD_FCFLAGS=
+      FCFLAGS="$orig_FCFLAGS"
+      AC_MSG_RESULT([no])
+    fi
+  done
+fi
+])
+
+
+AC_DEFUN([PMIX_INTL_POSIX_THREADS_SPECIAL_FLAGS],[
+# Begin: PMIX_INTL_POSIX_THREADS_SPECIAL_FLAGS
+#
+# If above didn't work, try some super-special compiler flags
+# that get evaluated to the "right" things.
+#
+# -Kthread:
+# -kthread:  FreeBSD kernel threads
+# -pthread:  Modern GCC (most all platforms)
+# -pthreads: GCC on solaris
+# -mthreads:
+# -mt:       Solaris native compilers / HP-UX aCC
+#
+# Put -mt before -mthreads because HP-UX aCC will properly compile
+# with -mthreads (reading as -mt), but emit a warning about unknown
+# flags hreads.  Stupid compilers.
+ +case "${host_cpu}-${host_os}" in + *solaris*) + pflags="-pthread -pthreads -mt" + ;; + *) + pflags="-Kthread -kthread -pthread -pthreads -mt -mthreads" + ;; +esac + +# Only run C++ and Fortran if those compilers already configured +AC_PROVIDE_IFELSE([AC_PROG_CC], + [PMIX_INTL_POSIX_THREADS_SPECIAL_FLAGS_C], + [pmix_pthread_c_success=1]) + +AC_PROVIDE_IFELSE([AC_PROG_CXX], + [PMIX_INTL_POSIX_THREADS_SPECIAL_FLAGS_CXX], + [pmix_pthread_cxx_success=1]) + +AC_PROVIDE_IFELSE([AC_PROG_FC], + [PMIX_INTL_POSIX_THREADS_SPECIAL_FLAGS_FC], + [pmix_pthread_fortran_success=1]) + +# End: PMIX_INTL_POSIX_THREADS_SPECIAL_FLAGS +])dnl + + +# ******************************************************************** +# +# Try to compile thread support with extra libs +# +# ******************************************************************** +AC_DEFUN([PMIX_INTL_POSIX_THREADS_LIBS_C],[ +# +# C compiler +# +if test "$pmix_pthread_c_success" = "0"; then + for pl in $plibs; do + AC_MSG_CHECKING([if C compiler and POSIX threads work with $pl]) + case "${host_cpu}-${host_os}" in + *-aix* | *-freebsd*) + if test "`echo $CPPFLAGS | $GREP 'D_THREAD_SAFE'`" = ""; then + PTHREAD_CPPFLAGS="-D_THREAD_SAFE" + CPPFLAGS="$CPPFLAGS $PTHREAD_CPPFLAGS" + fi + ;; + *) + if test "`echo $CPPFLAGS | $GREP 'D_REENTRANT'`" = ""; then + PTHREAD_CPPFLAGS="-D_REENTRANT" + CPPFLAGS="$CPPFLAGS $PTHREAD_CPPFLAGS" + fi + ;; + esac + LIBS="$orig_LIBS $pl" + AC_LANG_PUSH(C) + PMIX_INTL_PTHREAD_TRY_LINK(pmix_pthread_c_success=1, + pmix_pthread_c_success=0) + AC_LANG_POP(C) + if test "$pmix_pthread_c_success" = "1"; then + PTHREAD_LIBS="$pl" + AC_MSG_RESULT([yes]) + # Stop at the first library that links so a later failure cannot reset the result + break + else + PTHREAD_CPPFLAGS= + CPPFLAGS="$orig_CPPFLAGS" + LIBS="$orig_LIBS" + AC_MSG_RESULT([no]) + fi + done +fi +])dnl + + +AC_DEFUN([PMIX_INTL_POSIX_THREADS_LIBS_CXX],[ +# +# C++ compiler +# +if test "$pmix_pthread_cxx_success" = "0"; then + if test ! "$pmix_pthread_c_success" = "0" && test !
"$PTHREAD_LIBS" = "" ; then + AC_MSG_CHECKING([if C++ compiler and POSIX threads work with $PTHREAD_LIBS]) + case "${host_cpu}-${host_os}" in + *-aix* | *-freebsd*) + if test "`echo $CXXCPPFLAGS | $GREP 'D_THREAD_SAFE'`" = ""; then + PTHREAD_CXXCPPFLAGS="-D_THREAD_SAFE" + CXXCPPFLAGS="$CXXCPPFLAGS $PTHREAD_CXXCPPFLAGS" + fi + ;; + *) + if test "`echo $CXXCPPFLAGS | $GREP 'D_REENTRANT'`" = ""; then + PTHREAD_CXXCPPFLAGS="-D_REENTRANT" + CXXCPPFLAGS="$CXXCPPFLAGS $PTHREAD_CXXCPPFLAGS" + fi + ;; + esac + LIBS="$orig_LIBS $PTHREAD_LIBS" + AC_LANG_PUSH(C++) + PMIX_INTL_PTHREAD_TRY_LINK(pmix_pthread_cxx_success=1, + pmix_pthread_cxx_success=0) + AC_LANG_POP(C++) + if test "$pmix_pthread_cxx_success" = "1"; then + AC_MSG_RESULT([yes]) + else + CXXCPPFLAGS="$orig_CXXCPPFLAGS" + LIBS="$orig_LIBS" + AC_MSG_RESULT([no]) + AC_MSG_ERROR([Can not find working threads configuration. aborting]) + fi + else + for pl in $plibs; do + AC_MSG_CHECKING([if C++ compiler and POSIX threads work with $pl]) + case "${host_cpu}-${host_os}" in + *-aix* | *-freebsd*) + if test "`echo $CXXCPPFLAGS | $GREP 'D_THREAD_SAFE'`" = ""; then + PTHREAD_CXXCPPFLAGS="-D_THREAD_SAFE" + CXXCPPFLAGS="$CXXCPPFLAGS $PTHREAD_CXXCPPFLAGS" + fi + ;; + *) + if test "`echo $CXXCPPFLAGS | $GREP 'D_REENTRANT'`" = ""; then + PTHREAD_CXXCPPFLAGS="-D_REENTRANT" + CXXCPPFLAGS="$CXXCPPFLAGS $PTHREAD_CXXCPPFLAGS" + fi + ;; + esac + LIBS="$orig_LIBS $pl" + AC_LANG_PUSH(C++) + PMIX_INTL_PTHREAD_TRY_LINK(pmix_pthread_cxx_success=1, + pmix_pthread_cxx_success=0) + AC_LANG_POP(C++) + if test "$pmix_pthread_cxx_success" = "1"; then + PTHREAD_LIBS="$pl" + AC_MSG_RESULT([yes]) + # Stop at the first library that links so a later failure cannot reset the result + break + else + PTHREAD_CXXCPPFLAGS= + CXXCPPFLAGS="$orig_CXXCPPFLAGS" + LIBS="$orig_LIBS" + AC_MSG_RESULT([no]) + fi + done + fi +fi +])dnl + + +AC_DEFUN([PMIX_INTL_POSIX_THREADS_LIBS_FC],[ +# +# Fortran compiler +# +if test "$pmix_pthread_fortran_success" = "0" && \ + test "$OMPI_TRY_FORTRAN_BINDINGS" -gt "$OMPI_FORTRAN_NO_BINDINGS" && \ + test
$ompi_fortran_happy -eq 1; then + if test ! "$pmix_pthread_c_success" = "0" && test ! "$PTHREAD_LIBS" = "" ; then + AC_MSG_CHECKING([if Fortran compiler and POSIX threads work with $PTHREAD_LIBS]) + LIBS="$orig_LIBS $PTHREAD_LIBS" + AC_LANG_PUSH(C) + PMIX_INTL_PTHREAD_TRY_LINK_FORTRAN(pmix_pthread_fortran_success=1, + pmix_pthread_fortran_success=0) + AC_LANG_POP(C) + if test "$pmix_pthread_fortran_success" = "1"; then + AC_MSG_RESULT([yes]) + else + LIBS="$orig_LIBS" + AC_MSG_RESULT([no]) + AC_MSG_ERROR([Can not find working threads configuration. aborting]) + fi + else + for pl in $plibs; do + AC_MSG_CHECKING([if Fortran compiler and POSIX threads work with $pl]) + LIBS="$orig_LIBS $pl" + AC_LANG_PUSH(C) + PMIX_INTL_PTHREAD_TRY_LINK_FORTRAN(pmix_pthread_fortran_success=1, + pmix_pthread_fortran_success=0) + AC_LANG_POP(C) + if test "$pmix_pthread_fortran_success" = "1"; then + PTHREAD_LIBS="$pl" + AC_MSG_RESULT([yes]) + break + else + LIBS="$orig_LIBS" + AC_MSG_RESULT([no]) + fi + done + fi +fi +])dnl + + +AC_DEFUN([PMIX_INTL_POSIX_THREADS_LIBS],[ +# Begin: PMIX_INTL_POSIX_THREADS_LIBS +# +# if we can't find a super-special compiler flags, try some libraries. +# we throw -D_REENTRANT or -D_THREAD_SAFE in here, just in case. Some +# systems (OS X, for example) generally don't need the defines, but +# then will on one system header here or there why take chances? +# +# libpthreads: AIX - must check before libpthread +# liblthread: LinuxThreads on FreeBSD +# libpthread: The usual place (like we can define usual!) 
+plibs="-lpthreads -llthread -lpthread" + +# Only run C++ and Fortran if those compilers already configured +AC_PROVIDE_IFELSE([AC_PROG_CC], + [PMIX_INTL_POSIX_THREADS_LIBS_C], + [pmix_pthread_c_success=1]) + +AC_PROVIDE_IFELSE([AC_PROG_CXX], + [PMIX_INTL_POSIX_THREADS_LIBS_CXX], + [pmix_pthread_cxx_success=1]) + +AC_PROVIDE_IFELSE([AC_PROG_FC], + [PMIX_INTL_POSIX_THREADS_LIBS_FC], + [pmix_pthread_fortran_success=1]) + +# End: PMIX_INTL_POSIX_THREADS_LIBS] +)dnl + + +#******************************************************************** +# +# External macro (aka, the real thing) +# +#******************************************************************** +AC_DEFUN([PMIX_CONFIG_POSIX_THREADS],[ + AC_REQUIRE([AC_PROG_GREP]) + +pmix_pthread_c_success=0 +pmix_pthread_cxx_success=0 + +orig_CFLAGS="$CFLAGS" +orig_FCFLAGS="$FCFLAGS" +orig_CXXFLAGS="$CXXFLAGS" +orig_CPPFLAGS="$CPPFLAGS" +orig_CXXCPPFLAGS="$CXXCPPFLAGS" +orig_LDFLAGS="$LDFLAGS" +orig_LIBS="$LIBS" + +PTHREAD_CFLAGS= +PTHREAD_FCFLAGS= +PTHREAD_CXXFLAGS= +PTHREAD_CPPFLAGS= +PTHREAD_CXXCPPFLAGS= +PTHREAD_LDFLAGS= +PTHREAD_LIBS= + +# Try with the basics, mam. +PMIX_INTL_POSIX_THREADS_PLAIN + +# Try the super-special compiler flags. 
+PMIX_INTL_POSIX_THREADS_SPECIAL_FLAGS + +# Try the normal linking methods (that's no fun) +PMIX_INTL_POSIX_THREADS_LIBS + +# +# check to see if we can create shared memory mutexes and conditions +# +AC_CHECK_FUNCS([pthread_mutexattr_setpshared pthread_condattr_setpshared]) + +# +# check to see if we can set error checking mutexes +# + +# LinuxThreads +AC_MSG_CHECKING([for PTHREAD_MUTEX_ERRORCHECK_NP]) +AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [[#include <pthread.h>]], + [[pthread_mutexattr_settype(NULL, PTHREAD_MUTEX_ERRORCHECK_NP);]])], + [result="yes" defval=1], [result="no" defval=0]) +AC_MSG_RESULT([$result]) +AC_DEFINE_UNQUOTED([PMIX_HAVE_PTHREAD_MUTEX_ERRORCHECK_NP], [$defval], + [If PTHREADS implementation supports PTHREAD_MUTEX_ERRORCHECK_NP]) + +# Mac OS X +AC_MSG_CHECKING([for PTHREAD_MUTEX_ERRORCHECK]) +AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [[#include <pthread.h>]], + [[pthread_mutexattr_settype(NULL, PTHREAD_MUTEX_ERRORCHECK);]])], + [result="yes" defval=1], [result="no" defval=0]) +AC_MSG_RESULT([$result]) +AC_DEFINE_UNQUOTED([PMIX_HAVE_PTHREAD_MUTEX_ERRORCHECK], [$defval], + [If PTHREADS implementation supports PTHREAD_MUTEX_ERRORCHECK]) + +CFLAGS="$orig_CFLAGS" +FCFLAGS="$orig_FCFLAGS" +CXXFLAGS="$orig_CXXFLAGS" +CPPFLAGS="$orig_CPPFLAGS" +CXXCPPFLAGS="$orig_CXXCPPFLAGS" +LDFLAGS="$orig_LDFLAGS" +LIBS="$orig_LIBS" + +if test "$pmix_pthread_c_success" = "1" && \ + test "$pmix_pthread_cxx_success" = "1"; then + internal_useless=1 + $1 +else + internal_useless=1 + $2 +fi + +unset pmix_pthread_c_success pmix_pthread_fortran_success pmix_pthread_cxx_success +unset internal_useless +])dnl diff --git a/opal/mca/pmix/pmix2x/pmix/config/pmix_config_threads.m4 b/opal/mca/pmix/pmix2x/pmix/config/pmix_config_threads.m4 new file mode 100644 index 00000000000..541e63f726c --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/config/pmix_config_threads.m4 @@ -0,0 +1,71 @@ +dnl +dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana +dnl University Research and Technology +dnl
Corporation. All rights reserved. +dnl Copyright (c) 2004-2005 The University of Tennessee and The University +dnl of Tennessee Research Foundation. All rights +dnl reserved. +dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +dnl University of Stuttgart. All rights reserved. +dnl Copyright (c) 2004-2005 The Regents of the University of California. +dnl All rights reserved. +dnl Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +dnl Copyright (c) 2009-2011 Oak Ridge National Labs. All rights reserved. +dnl Copyright (c) 2014-2017 Intel, Inc. All rights reserved. +dnl Copyright (c) 2015 Research Organization for Information Science +dnl and Technology (RIST). All rights reserved. +dnl $COPYRIGHT$ +dnl +dnl Additional copyrights may follow +dnl +dnl $HEADER$ +dnl + +AC_DEFUN([PMIX_CONFIG_THREADS],[ +# +# Arguments: none +# +# Dependencies: None +# +# Modifies: +# none - see called tests +# +# configure threads +# + +# +# Check we have POSIX threads +# +PMIX_CONFIG_POSIX_THREADS(HAVE_POSIX_THREADS=1, HAVE_POSIX_THREADS=0) +AC_MSG_CHECKING([for working POSIX threads package]) +if test "$HAVE_POSIX_THREADS" = "1" ; then + AC_MSG_RESULT([yes]) +else + AC_MSG_RESULT([no]) +fi +export HAVE_POSIX_THREADS + +# +# Ask what threading we want (allow posix right now) +# + +if test "$HAVE_POSIX_THREADS" = "0"; then + AC_MSG_WARN(["*** POSIX threads are not"]) + AC_MSG_WARN(["*** available on your system "]) + AC_MSG_ERROR(["*** Can not continue"]) +fi + +THREAD_CFLAGS="$PTHREAD_CFLAGS" +THREAD_FCFLAGS="$PTHREAD_FCFLAGS" +THREAD_CXXFLAGS="$PTHREAD_CXXFLAGS" +THREAD_CPPFLAGS="$PTHREAD_CPPFLAGS" +THREAD_CXXCPPFLAGS="$PTHREAD_CXXCPPFLAGS" +THREAD_LDFLAGS="$PTHREAD_LDFLAGS" +THREAD_LIBS="$PTHREAD_LIBS" + +PMIX_CHECK_PTHREAD_PIDS + +AC_DEFINE_UNQUOTED([PMIX_ENABLE_MULTI_THREADS], [1], + [Whether we should enable thread support within the PMIX code base]) + +])dnl diff --git a/opal/mca/pmix/pmix2x/pmix/config/pmix_try_assemble.m4 
b/opal/mca/pmix/pmix2x/pmix/config/pmix_try_assemble.m4 new file mode 100644 index 00000000000..eba8dfd629a --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/config/pmix_try_assemble.m4 @@ -0,0 +1,52 @@ +dnl +dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +dnl University Research and Technology +dnl Corporation. All rights reserved. +dnl Copyright (c) 2004-2005 The University of Tennessee and The University +dnl of Tennessee Research Foundation. All rights +dnl reserved. +dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +dnl University of Stuttgart. All rights reserved. +dnl Copyright (c) 2004-2005 The Regents of the University of California. +dnl All rights reserved. +dnl Copyright (c) 2014-2017 Intel, Inc. All rights reserved. +dnl $COPYRIGHT$ +dnl +dnl Additional copyrights may follow +dnl +dnl $HEADER$ +dnl + +dnl PMIX_TRY_ASSEMBLE(asm-code, [action-if-success], [action-if-fail]) +dnl +dnl Attempt to assemble asm-code. If success, run action-if-success. +dnl Otherwise, run action-if-fail. Neither action-if-success nor +dnl action-if-fail are required. +dnl +dnl No preprocessing is guaranteed to be done on asm-code. Some +dnl compilers do not run the preprocessor on assembly files. 
+dnl +dnl On failure, asm-test.s will be included in config.out +AC_DEFUN([PMIX_TRY_ASSEMBLE], +[cat >conftest.s <<EOF +[$1] +EOF +if test "$CC" = "$CCAS" ; then + pmix_assemble="$CCAS $CCASFLAGS -c conftest.s >conftest.out 2>&1" +else + pmix_assemble="$CCAS $CCASFLAGS -o conftest.o conftest.s >conftest.out 2>&1" +fi +if AC_TRY_EVAL(pmix_assemble); then + # save the warnings + cat conftest.out >&AC_FD_CC + ifelse([$2],,:,[$2]) +else + # save compiler output and failed program + cat conftest.out >&AC_FD_CC + echo "configure: failed program was:" >&AC_FD_CC + cat conftest.s >&AC_FD_CC + ifelse([$3],,:,[$3]) +fi +rm -rf conftest* +unset pmix_assemble +])dnl diff --git a/opal/mca/pmix/pmix2x/pmix/include/Makefile.am b/opal/mca/pmix/pmix2x/pmix/include/Makefile.am index 52ad624c512..35bcf6d78ca 100644 --- a/opal/mca/pmix/pmix2x/pmix/include/Makefile.am +++ b/opal/mca/pmix/pmix2x/pmix/include/Makefile.am @@ -17,7 +17,7 @@ include_HEADERS = \ pmix_server.h \ pmix_tool.h -if WANT_PMIX_BACKWARD +if WANT_PMI_BACKWARD include_HEADERS += \ pmi.h \ pmi2.h diff --git a/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h b/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h index 74e4d7b6169..16e18e68ee7 100644 --- a/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h +++ b/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h @@ -131,6 +131,10 @@ typedef uint32_t pmix_rank_t; #define PMIX_GRPID "pmix.egid" // (uint32_t) effective group id #define PMIX_DSTPATH "pmix.dstpath" // (char*) path to dstore files #define PMIX_VERSION_INFO "pmix.version" // (char*) PMIx version of contactor +#define PMIX_PROGRAMMING_MODEL "pmix.pgm.model" // (char*) programming model being initialized (e.g., "MPI" or "OpenMP") +#define PMIX_MODEL_LIBRARY_NAME "pmix.mdl.name" // (char*) programming model implementation ID (e.g., "OpenMPI" or "MPICH") +#define PMIX_MODEL_LIBRARY_VERSION "pmix.mld.vrs" // (char*) programming model version string (e.g., "2.1.1") +#define PMIX_THREADING_MODEL "pmix.threads" // (char*) threading model used (e.g., "pthreads") /* attributes for the USOCK rendezvous socket */ @@ -531,6 +535,7 @@ typedef int pmix_status_t; #define PMIX_ERR_EVENT_REGISTRATION (PMIX_ERR_OP_BASE - 14) #define PMIX_ERR_JOB_TERMINATED (PMIX_ERR_OP_BASE - 15) #define PMIX_ERR_UPDATE_ENDPOINTS
(PMIX_ERR_OP_BASE - 16) +#define PMIX_MODEL_DECLARED (PMIX_ERR_OP_BASE - 17) /* define a starting point for system error constants so * we avoid renumbering when making additions */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/Makefile.am b/opal/mca/pmix/pmix2x/pmix/src/Makefile.am index e70a8a39d58..63370390848 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/Makefile.am +++ b/opal/mca/pmix/pmix2x/pmix/src/Makefile.am @@ -24,6 +24,7 @@ ACLOCAL_AMFLAGS = -I ./config SUBDIRS = \ + atomics/asm \ util/keyval \ mca/base \ $(MCA_pmix_FRAMEWORKS_SUBDIRS) \ @@ -32,6 +33,7 @@ SUBDIRS = \ $(MCA_pmix_FRAMEWORK_COMPONENT_DSO_SUBDIRS) DIST_SUBDIRS = \ + atomics/asm \ util/keyval \ mca/base \ $(MCA_pmix_FRAMEWORKS_SUBDIRS) \ @@ -52,7 +54,6 @@ if PMIX_EMBEDDED_MODE if WANT_INSTALL_HEADERS -# retain output of pmix library lib_LTLIBRARIES = libpmix.la libpmix_la_SOURCES = $(headers) $(sources) libpmix_la_LDFLAGS = -version-info $(libpmix_so_version) @@ -73,6 +74,8 @@ libpmix_la_LDFLAGS = -version-info $(libpmix_so_version) endif !PMIX_EMBEDDED_MODE +include atomics/sys/Makefile.include +include threads/Makefile.include include class/Makefile.include include event/Makefile.include include include/Makefile.include diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/Makefile.am b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/Makefile.am new file mode 100644 index 00000000000..4aee801de82 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/Makefile.am @@ -0,0 +1,92 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. 
+# Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2017 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +###################################################################### +# +# This is a bit complicated. If there is anything in the library, +# it will always be atomic-asm.S. We just symlink atomic-asm.S to +# the best atomic operations available (as determined at configure +# time) +# +###################################################################### +generated/@PMIX_ASM_FILE@: base/@PMIX_ASSEMBLY_ARCH@.asm + @ if test ! -f "$(top_srcdir)/src/atomics/asm/$@" ; then \ + cmd="$(PERL) '$(top_srcdir)/src/atomics/asm/generate-asm.pl' '@PMIX_ASSEMBLY_ARCH@' '@PMIX_ASSEMBLY_FORMAT@' '$(top_srcdir)/src/atomics/asm/base' '$(top_builddir)/src/atomics/asm/generated/@PMIX_ASM_FILE@'" ; \ + echo "$$cmd" ; \ + eval $$cmd ; \ + fi + +atomic-asm.S: generated/@PMIX_ASM_FILE@ + rm -f atomic-asm.S + @ if test -f "$(top_builddir)/src/atomics/asm/generated/@PMIX_ASM_FILE@" ; then \ + cmd="ln -s \"$(top_builddir)/src/atomics/asm/generated/@PMIX_ASM_FILE@\" atomic-asm.S" ; \ + echo "$$cmd" ; \ + eval $$cmd ; \ + else \ + cmd="ln -s \"$(top_srcdir)/src/atomics/asm/generated/@PMIX_ASM_FILE@\" atomic-asm.S" ; \ + echo "$$cmd" ; \ + eval $$cmd ; \ + fi + +if PMIX_HAVE_ASM_FILE +nodist_libasm_la_SOURCES = atomic-asm.S +libasm_la_DEPENDENCIES = generated/@PMIX_ASM_FILE@ +else +nodist_libasm_la_SOURCES = +libasm_la_DEPENDENCIES = +endif + +noinst_LTLIBRARIES = libasm.la +dist_libasm_la_SOURCES = asm.c + +EXTRA_DIST = \ + asm-data.txt \ + generate-asm.pl \ + generate-all-asm.pl \ + base/aix.conf \ + base/default.conf \ + base/X86_64.asm \ + base/ARM.asm \ + base/IA32.asm \ + base/IA64.asm \ + base/MIPS.asm \ + base/POWERPC32.asm \ + base/POWERPC64.asm \ + base/SPARCV9_32.asm \ + 
base/SPARCV9_64.asm + +###################################################################### + +clean-local: + rm -f atomic-asm.S + +distclean-local: + rm -f generated/atomic-local.s + +###################################################################### + +# +# Copy over all the generated files +# +dist-hook: + mkdir "${distdir}/generated" + $(PERL) "$(top_srcdir)/src/atomics/asm/generate-all-asm.pl" "$(PERL)" "$(srcdir)" "$(distdir)" diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/asm-data.txt b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/asm-data.txt new file mode 100644 index 00000000000..55360354fb2 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/asm-data.txt @@ -0,0 +1,133 @@ +# -*- sh -*- +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2014-2017 Intel, Inc. All rights reserved. +# Copyright (c) 2017 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# Database for mapping architecture and assembly format to prebuilt +# assembly files. For an explanation of the assembly operations, see +# the inline assembly header files in src/atomics/sys/.
+# +# FORMAT: +# ARCHITECTURE ASSEMBLY FORMAT BASE FILENAME +# +# Assembly Format field: +# config_file-text-global-label_suffix-gsym-lsym-type-size-align_log-ppc_r_reg-64_bit-gnu_stack + +###################################################################### +# +# AMD Opteron / Intel EM64T +# +###################################################################### + +X86_64 default-.text-.globl-:--.L-@-1-0-1-1-1 x86_64-linux +X86_64 default-.text-.globl-:--.L-@-1-0-1-1-0 x86_64-linux-nongas + + +###################################################################### +# +# ARM (ARMv7 and later) +# +###################################################################### + +ARM default-.text-.globl-:--.L-#-1-1-1-1-1 arm-linux + + +###################################################################### +# +# Intel Pentium Class +# +###################################################################### + +IA32 default-.text-.globl-:--.L-@-1-0-1-1-1 ia32-linux +IA32 default-.text-.globl-:--.L-@-1-0-1-1-0 ia32-linux-nongas +IA32 default-.text-.globl-:-_-L--0-1-1-1-0 ia32-osx +IA32 default-.text-.globl-:-_-L--0-0-1-1-1 ia32-cygwin +IA32 default-.text-.globl-:-_-L--0-0-1-1-0 ia32-cygwin-nongas + + +###################################################################### +# +# IA64 (Intel Itanium) +# +###################################################################### + +IA64 default-.text-.globl-:--.L-@-1-0-1-1-1 ia64-linux +IA64 default-.text-.globl-:--.L-@-1-0-1-1-0 ia64-linux-nongas + + +###################################################################### +# +# PowerPC / POWER +# +###################################################################### + +# standard ppc instruction set (AIX calls it ppc). This is not the +# true intersection of all the POWER / PowerPC machines, but works +# on PowerPCs since the 601 and on at least POWER 3 and above. 
+POWERPC32 default-.text-.globl-:-_-L--0-1-1-0-0 powerpc32-osx +POWERPC32 default-.text-.globl-:--.L-@-1-1-0-0-1 powerpc32-linux +POWERPC32 default-.text-.globl-:--.L-@-1-1-0-0-0 powerpc32-linux-nongas +POWERPC32 aix-.csect .text[PR]-.globl-:-.-L--0-1-0-0-0 powerpc32-aix + +# The ppc code above, plus support for the 64 bit operations. This +# mode is really only available on OS X when using the OS X 10.3 +# compiler chain with the -mcpu=970 option. +POWERPC32 default-.text-.globl-:-_-L--0-1-1-1-0 powerpc32-64-osx + +# PowerPC / POWER 64bit machines. sizeof(void*) == 8. +POWERPC64 default-.text-.globl-:-_-L--0-1-1-1-0 powerpc64-osx +POWERPC64 default-.text-.globl-:-.-.L-@-1-1-0-1-1 powerpc64-linux +POWERPC64 default-.text-.globl-:-.-.L-@-1-1-0-1-0 powerpc64-linux-nongas +POWERPC64 aix-.csect .text[PR]-.globl-:-.-L--0-1-0-1-0 powerpc64-aix + + +###################################################################### +# +# SPARC / UltraSPARC (Scalable Processor ARChitecture) +# +###################################################################### + +# Usually compiled with -xarch=v8plus. Basically Sparc V9, but with +# sizeof(void*) == 4 instead of 8. Different from V9_64 because still +# uses 2 registers to pass in a 64bit integer +SPARCV9_32 default-.text-.globl-:--.L-#-1-0-1-1-0 sparcv9-32-solaris + +# The Sparc v9 (aka Ultra Sparc). Sizeof(void*) == 8. +SPARCV9_64 default-.text-.globl-:--.L-#-1-0-1-1-0 sparcv9-64-solaris + + +###################################################################### +# +# MIPS III (Microprocessor without Interlocked Pipeline Stages) +# R4000 and above +# +###################################################################### + +# So MIPS, in its infinite wisdom (thank you!) decided that when +# compiling in 32bit mode and passing in a 64bit integer, it is done +# in one register (instead of SPARC and POWER, who use two). Which +# means that we can use the same code either way. Woo hoo!
+ +MIPS default-.text-.globl-:--L--1-1-1-1-0 mips-irix +MIPS default-.text-.globl-:--L--1-1-1-1-0 mips64el +MIPS default-.text-.globl-:--L-@-1-1-1-1-1 mips64-linux + +# However, this doesn't hold true for 32-bit MIPS as used on Linux. +MIPS default-.text-.globl-:--L-@-1-1-1-0-1 mips-linux diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/asm.c b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/asm.c new file mode 100644 index 00000000000..e2d4deabe70 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/asm.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "pmix_config.h" + +#include "src/atomics/sys/atomic.h" +#include "src/atomics/sys/architecture.h" + +#if PMIX_ASSEMBLY_ARCH == PMIX_SPARC + +#define LOCKS_TABLE_SIZE 8 +/* make sure to get into reasonably useful bits (so shift at least 5) */ +#define FIND_LOCK(addr) (&(locks_table[(((unsigned long) addr) >> 8) & \ + (LOCKS_TABLE_SIZE - 1)])) + +/* have to fix if you change LOCKS_TABLE_SIZE */ +static pmix_atomic_lock_t locks_table[LOCKS_TABLE_SIZE] = { + { { PMIX_ATOMIC_UNLOCKED } }, + { { PMIX_ATOMIC_UNLOCKED } }, + { { PMIX_ATOMIC_UNLOCKED } }, + { { PMIX_ATOMIC_UNLOCKED } }, + { { PMIX_ATOMIC_UNLOCKED } }, + { { PMIX_ATOMIC_UNLOCKED } }, + { { PMIX_ATOMIC_UNLOCKED } }, + { { PMIX_ATOMIC_UNLOCKED } } +}; + + +/* Add delta to *addr while holding the table lock selected by + * FIND_LOCK(addr); returns the value stored in *addr after the addition. */ +int32_t +pmix_atomic_add_32(volatile int32_t *addr, int delta) +{ + int32_t ret; + + pmix_atomic_lock(FIND_LOCK(addr)); + + ret = (*addr += delta); + + pmix_atomic_unlock(FIND_LOCK(addr)); + + return ret; +} + + +/* Subtract delta from *addr while holding the table lock selected by + * FIND_LOCK(addr); returns the value stored in *addr after the subtraction. */ +int32_t +pmix_atomic_sub_32(volatile int32_t *addr, int delta) +{ + int32_t ret; + + pmix_atomic_lock(FIND_LOCK(addr)); + + ret = (*addr -= delta); + + pmix_atomic_unlock(FIND_LOCK(addr)); + + return ret; +} + + +#endif /* PMIX_ASSEMBLY_ARCH == PMIX_SPARC */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/ARM.asm b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/ARM.asm new file mode 100644 index 00000000000..e3720299f77 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/ARM.asm @@ -0,0 +1,153 @@ +START_FILE + TEXT + + ALIGN(4) +START_FUNC(pmix_atomic_mb) + dmb + bx lr +END_FUNC(pmix_atomic_mb) + + +START_FUNC(pmix_atomic_rmb) + dmb + bx lr +END_FUNC(pmix_atomic_rmb) + + +START_FUNC(pmix_atomic_wmb) + dmb + bx lr +END_FUNC(pmix_atomic_wmb) + + +START_FUNC(pmix_atomic_cmpset_32) + LSYM(1) + ldrex r3, [r0] + cmp r1, r3 + bne REFLSYM(2) + strex r12, r2, [r0] + cmp r12, #0 + bne REFLSYM(1) + mov r0, #1 + LSYM(2) + movne r0, #0 + bx lr
+END_FUNC(pmix_atomic_cmpset_32) + + +START_FUNC(pmix_atomic_cmpset_acq_32) + LSYM(3) + ldrex r3, [r0] + cmp r1, r3 + bne REFLSYM(4) + strex r12, r2, [r0] + cmp r12, #0 + bne REFLSYM(3) + dmb + mov r0, #1 + LSYM(4) + movne r0, #0 + bx lr +END_FUNC(pmix_atomic_cmpset_acq_32) + + +START_FUNC(pmix_atomic_cmpset_rel_32) + LSYM(5) + ldrex r3, [r0] + cmp r1, r3 + bne REFLSYM(6) + dmb + strex r12, r2, [r0] + cmp r12, #0 + bne REFLSYM(5) + mov r0, #1 + LSYM(6) + movne r0, #0 + bx lr +END_FUNC(pmix_atomic_cmpset_rel_32) + +#START_64BIT +START_FUNC(pmix_atomic_cmpset_64) + push {r4-r7} + ldrd r6, r7, [sp, #16] + LSYM(7) + ldrexd r4, r5, [r0] + cmp r4, r2 + it eq + cmpeq r5, r3 + bne REFLSYM(8) + strexd r1, r6, r7, [r0] + cmp r1, #0 + bne REFLSYM(7) + mov r0, #1 + LSYM(8) + movne r0, #0 + pop {r4-r7} + bx lr +END_FUNC(pmix_atomic_cmpset_64) + +START_FUNC(pmix_atomic_cmpset_acq_64) + push {r4-r7} + ldrd r6, r7, [sp, #16] + LSYM(9) + ldrexd r4, r5, [r0] + cmp r4, r2 + it eq + cmpeq r5, r3 + bne REFLSYM(10) + strexd r1, r6, r7, [r0] + cmp r1, #0 + bne REFLSYM(9) + dmb + mov r0, #1 + LSYM(10) + movne r0, #0 + pop {r4-r7} + bx lr +END_FUNC(pmix_atomic_cmpset_acq_64) + + +START_FUNC(pmix_atomic_cmpset_rel_64) + push {r4-r7} + ldrd r6, r7, [sp, #16] + LSYM(11) + ldrexd r4, r5, [r0] + cmp r4, r2 + it eq + cmpeq r5, r3 + bne REFLSYM(12) + dmb + strexd r1, r6, r7, [r0] + cmp r1, #0 + bne REFLSYM(11) + mov r0, #1 + LSYM(12) + movne r0, #0 + pop {r4-r7} + bx lr +END_FUNC(pmix_atomic_cmpset_rel_64) +#END_64BIT + + +START_FUNC(pmix_atomic_add_32) + LSYM(13) + ldrex r2, [r0] + add r2, r2, r1 + strex r3, r2, [r0] + cmp r3, #0 + bne REFLSYM(13) + mov r0, r2 + bx lr +END_FUNC(pmix_atomic_add_32) + + +START_FUNC(pmix_atomic_sub_32) + LSYM(14) + ldrex r2, [r0] + sub r2, r2, r1 + strex r3, r2, [r0] + cmp r3, #0 + bne REFLSYM(14) + mov r0, r2 + bx lr +END_FUNC(pmix_atomic_sub_32) diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/IA32.asm
b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/IA32.asm new file mode 100644 index 00000000000..d145aa237e5 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/IA32.asm @@ -0,0 +1,110 @@ +START_FILE + TEXT + +START_FUNC(pmix_atomic_mb) + pushl %ebp + movl %esp, %ebp + leave + ret +END_FUNC(pmix_atomic_mb) + + +START_FUNC(pmix_atomic_rmb) + pushl %ebp + movl %esp, %ebp + leave + ret +END_FUNC(pmix_atomic_rmb) + + +START_FUNC(pmix_atomic_wmb) + pushl %ebp + movl %esp, %ebp + leave + ret +END_FUNC(pmix_atomic_wmb) + + +START_FUNC(pmix_atomic_cmpset_32) + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %edx + movl 16(%ebp), %ecx + movl 12(%ebp), %eax + lock; cmpxchgl %ecx,(%edx) + sete %dl + + movzbl %dl, %eax + leave + ret +END_FUNC(pmix_atomic_cmpset_32) + + +START_FUNC(pmix_atomic_cmpset_64) + pushl %ebp + movl %esp, %ebp + subl $32, %esp + movl %ebx, -12(%ebp) + movl %esi, -8(%ebp) + movl %edi, -4(%ebp) + movl 8(%ebp), %edi + movl 12(%ebp), %eax + movl 16(%ebp), %edx + movl %eax, -24(%ebp) + movl %edx, -20(%ebp) + movl 20(%ebp), %eax + movl 24(%ebp), %edx + movl %eax, -32(%ebp) + movl %edx, -28(%ebp) + movl -24(%ebp), %ebx + movl -20(%ebp), %edx + movl -32(%ebp), %esi + movl -28(%ebp), %ecx + movl %ebx, %eax + push %ebx + movl %esi, %ebx + lock; cmpxchg8b (%edi) + sete %dl + pop %ebx + + movzbl %dl, %eax + movl -12(%ebp), %ebx + movl -8(%ebp), %esi + movl -4(%ebp), %edi + movl %ebp, %esp + popl %ebp + ret +END_FUNC(pmix_atomic_cmpset_64) + + +START_FUNC(pmix_atomic_add_32) + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %eax + movl 12(%ebp), %edx + lock; addl %edx,(%eax) + movl (%eax), %eax + leave + ret +END_FUNC(pmix_atomic_add_32) + + +START_FUNC(pmix_atomic_sub_32) + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %eax + movl 12(%ebp), %edx + lock; subl %edx,(%eax) + movl (%eax), %eax + leave + ret +END_FUNC(pmix_atomic_sub_32) + + +START_FUNC(pmix_sys_timer_get_cycles) + pushl %ebp + movl %esp, %ebp + rdtsc + popl %ebp + ret 
+END_FUNC(pmix_sys_timer_get_cycles) diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/IA64.asm b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/IA64.asm new file mode 100644 index 00000000000..a7287a8ffce --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/IA64.asm @@ -0,0 +1,109 @@ +START_FILE + + .pred.safe_across_calls p1-p5,p16-p63 + .text + .align 16 + .global pmix_atomic_mb# + .proc pmix_atomic_mb# +pmix_atomic_mb: + .prologue + .body + mf + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_mb# + .align 16 + .global pmix_atomic_rmb# + .proc pmix_atomic_rmb# +pmix_atomic_rmb: + .prologue + .body + mf + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_rmb# + .align 16 + .global pmix_atomic_wmb# + .proc pmix_atomic_wmb# +pmix_atomic_wmb: + .prologue + .body + mf + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_wmb# + .align 16 + .global pmix_atomic_cmpset_acq_32# + .proc pmix_atomic_cmpset_acq_32# +pmix_atomic_cmpset_acq_32: + .prologue + .body + mov ar.ccv=r33;; + cmpxchg4.acq r32=[r32],r34,ar.ccv + ;; + cmp4.eq p6, p7 = r32, r33 + ;; + (p6) addl r8 = 1, r0 + (p7) mov r8 = r0 + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_cmpset_acq_32# + .align 16 + .global pmix_atomic_cmpset_rel_32# + .proc pmix_atomic_cmpset_rel_32# +pmix_atomic_cmpset_rel_32: + .prologue + .body + mov ar.ccv=r33;; + cmpxchg4.rel r32=[r32],r34,ar.ccv + ;; + cmp4.eq p6, p7 = r32, r33 + ;; + (p6) addl r8 = 1, r0 + (p7) mov r8 = r0 + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_cmpset_rel_32# + .align 16 + .global pmix_atomic_cmpset_acq_64# + .proc pmix_atomic_cmpset_acq_64# +pmix_atomic_cmpset_acq_64: + .prologue + .body + mov ar.ccv=r33;; + cmpxchg8.acq r32=[r32],r34,ar.ccv + ;; + cmp.eq p6, p7 = r33, r32 + ;; + (p6) addl r8 = 1, r0 + (p7) mov r8 = r0 + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_cmpset_acq_64# + .align 16 + .global pmix_atomic_cmpset_rel_64# + .proc pmix_atomic_cmpset_rel_64# +pmix_atomic_cmpset_rel_64: + .prologue + .body + mov ar.ccv=r33;; + cmpxchg8.rel 
r32=[r32],r34,ar.ccv + ;; + cmp.eq p6, p7 = r33, r32 + ;; + (p6) addl r8 = 1, r0 + (p7) mov r8 = r0 + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_cmpset_rel_64# + .align 16 + .global pmix_sys_timer_get_cycles# + .proc pmix_sys_timer_get_cycles# +pmix_sys_timer_get_cycles: + .prologue + .body + mov r8=ar.itc + br.ret.sptk.many b0 + ;; + .endp pmix_sys_timer_get_cycles# + .ident "GCC: (GNU) 3.2.3 20030502 (Red Hat Linux 3.2.3-49)" diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/MIPS.asm b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/MIPS.asm new file mode 100644 index 00000000000..a30ac9f9b52 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/MIPS.asm @@ -0,0 +1,196 @@ +START_FILE + +#ifdef __linux__ +#include <sys/asm.h> +#else +#include <asm.h> +#endif +#include <regdef.h> + + TEXT + + ALIGN(8) +LEAF(pmix_atomic_mb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_mb) + + + ALIGN(8) +LEAF(pmix_atomic_rmb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_rmb) + + +LEAF(pmix_atomic_wmb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_wmb) + + +LEAF(pmix_atomic_cmpset_32) + .set noreorder +retry1: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done1 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry1 +done1: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_32) + + +LEAF(pmix_atomic_cmpset_acq_32) + .set noreorder +retry2: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done2 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry2 +done2: +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ +
.set mips0 +#endif + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_acq_32) + + +LEAF(pmix_atomic_cmpset_rel_32) + .set noreorder +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif +retry3: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done3 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry3 +done3: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_rel_32) + +#ifdef __mips64 +LEAF(pmix_atomic_cmpset_64) + .set noreorder +retry4: + lld $3, 0($4) + bne $3, $5, done4 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry4 +done4: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_64) + + +LEAF(pmix_atomic_cmpset_acq_64) + .set noreorder +retry5: + lld $3, 0($4) + bne $3, $5, done5 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry5 +done5: + sync + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_acq_64) + + +LEAF(pmix_atomic_cmpset_rel_64) + .set noreorder + sync +retry6: + lld $3, 0($4) + bne $3, $5, done6 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry6 +done6: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_rel_64) +#endif /* __mips64 */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/POWERPC32.asm b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/POWERPC32.asm new file mode 100644 index 00000000000..f341367806e --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/POWERPC32.asm @@ -0,0 +1,168 @@ +START_FILE + TEXT + + ALIGN(4) +START_FUNC(pmix_atomic_mb) + sync + blr +END_FUNC(pmix_atomic_mb) + + +START_FUNC(pmix_atomic_rmb) + lwsync + blr +END_FUNC(pmix_atomic_rmb) + + +START_FUNC(pmix_atomic_wmb) + eieio + blr +END_FUNC(pmix_atomic_wmb) + + +START_FUNC(pmix_atomic_cmpset_32) + LSYM(1) lwarx r0, 0, r3 + cmpw 0, r0, r4 + bne- REFLSYM(2) + stwcx. 
r5, 0, r3 + bne- REFLSYM(1) + LSYM(2) + xor r3,r0,r4 + subfic r5,r3,0 + adde r3,r5,r3 + blr +END_FUNC(pmix_atomic_cmpset_32) + + +START_FUNC(pmix_atomic_cmpset_acq_32) + LSYM(3) lwarx r0, 0, r3 + cmpw 0, r0, r4 + bne- REFLSYM(4) + stwcx. r5, 0, r3 + bne- REFLSYM(3) + sync + LSYM(4) + xor r3,r0,r4 + subfic r5,r3,0 + adde r3,r5,r3 + lwsync + blr +END_FUNC(pmix_atomic_cmpset_acq_32) + + +START_FUNC(pmix_atomic_cmpset_rel_32) + eieio + LSYM(5) lwarx r0, 0, r3 + cmpw 0, r0, r4 + bne- REFLSYM(6) + stwcx. r5, 0, r3 + bne- REFLSYM(5) + sync + LSYM(6) + xor r3,r0,r4 + subfic r5,r3,0 + adde r3,r5,r3 + blr +END_FUNC(pmix_atomic_cmpset_rel_32) + +#START_64BIT +START_FUNC(pmix_atomic_cmpset_64) + stw r4,-32(r1) + stw r5,-28(r1) + stw r6,-24(r1) + stw r7,-20(r1) + ld r5,-32(r1) + ld r7,-24(r1) + LSYM(7) ldarx r9, 0, r3 + cmpd 0, r9, r5 + bne- REFLSYM(8) + stdcx. r7, 0, r3 + bne- REFLSYM(7) + LSYM(8) + xor r3,r5,r9 + subfic r5,r3,0 + adde r3,r5,r3 + blr +END_FUNC(pmix_atomic_cmpset_64) + + +START_FUNC(pmix_atomic_cmpset_acq_64) + stw r4,-32(r1) + stw r5,-28(r1) + stw r6,-24(r1) + stw r7,-20(r1) + ld r5,-32(r1) + ld r7,-24(r1) + + LSYM(9) ldarx r9, 0, r3 + cmpd 0, r9, r5 + bne- REFLSYM(10) + stdcx. r7, 0, r3 + bne- REFLSYM(9) + LSYM(10) + xor r3,r5,r9 + subfic r5,r3,0 + adde r3,r5,r3 + blr + lwsync + blr +END_FUNC(pmix_atomic_cmpset_acq_64) + + +START_FUNC(pmix_atomic_cmpset_rel_64) + stw r4,-32(r1) + stw r5,-28(r1) + stw r6,-24(r1) + stw r7,-20(r1) + ld r5,-32(r1) + ld r7,-24(r1) + + eieio + LSYM(11) ldarx r9, 0, r3 + cmpd 0, r9, r5 + bne- REFLSYM(12) + stdcx. r7, 0, r3 + bne- REFLSYM(11) + LSYM(12) + xor r3,r5,r9 + subfic r5,r3,0 + adde r3,r5,r3 + blr + lwsync + blr +END_FUNC(pmix_atomic_cmpset_rel_64) +#END_64BIT + + +START_FUNC(pmix_atomic_add_32) + LSYM(13) lwarx r0, 0, r3 + add r0, r4, r0 + stwcx. r0, 0, r3 + bne- REFLSYM(13) + mr r3,r0 + blr +END_FUNC(pmix_atomic_add_32) + + +START_FUNC(pmix_atomic_sub_32) + LSYM(14) lwarx r0,0,r3 + subf r0,r4,r0 + stwcx. 
r0,0,r3 + bne- REFLSYM(14) + mr r3,r0 + blr +END_FUNC(pmix_atomic_sub_32) + +START_FUNC(pmix_sys_timer_get_cycles) + LSYM(15) + mftbu r0 + mftb r11 + mftbu r2 + cmpw cr7,r2,r0 + bne+ cr7,REFLSYM(15) + li r4,0 + li r9,0 + or r3,r2,r9 + or r4,r4,r11 + blr +END_FUNC(pmix_sys_timer_get_cycles) diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/POWERPC64.asm b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/POWERPC64.asm new file mode 100644 index 00000000000..6fc4ad717c7 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/POWERPC64.asm @@ -0,0 +1,157 @@ +START_FILE + TEXT + + ALIGN(4) +START_FUNC(pmix_atomic_mb) + sync + blr +END_FUNC(pmix_atomic_mb) + + +START_FUNC(pmix_atomic_rmb) + lwsync + blr +END_FUNC(pmix_atomic_rmb) + + +START_FUNC(pmix_atomic_wmb) + eieio + blr +END_FUNC(pmix_atomic_wmb) + + +START_FUNC(pmix_atomic_cmpset_32) + LSYM(1) lwarx r0, 0, r3 + cmpw 0, r0, r4 + bne- REFLSYM(2) + stwcx. r5, 0, r3 + bne- REFLSYM(1) + LSYM(2) + cmpw cr7,r0,r4 + mfcr r3 + rlwinm r3,r3,31,1 + blr +END_FUNC(pmix_atomic_cmpset_32) + + +START_FUNC(pmix_atomic_cmpset_acq_32) + mflr r0 + std r29,-24(r1) + std r0,16(r1) + stdu r1,-144(r1) + bl REFGSYM(pmix_atomic_cmpset_32) + mr r29,r3 + bl REFGSYM(pmix_atomic_rmb) + mr r3,r29 + addi r1,r1,144 + ld r0,16(r1) + mtlr r0 + ld r29,-24(r1) + blr +END_FUNC(pmix_atomic_cmpset_acq_32) + + +START_FUNC(pmix_atomic_cmpset_rel_32) + mflr r0 + std r27,-40(r1) + std r28,-32(r1) + std r29,-24(r1) + std r0,16(r1) + stdu r1,-160(r1) + mr r29,r3 + mr r28,r4 + mr r27,r5 + bl REFGSYM(pmix_atomic_wmb) + mr r3,r29 + mr r4,r28 + mr r5,r27 + bl REFGSYM(pmix_atomic_cmpset_32) + addi r1,r1,160 + ld r0,16(r1) + mtlr r0 + ld r27,-40(r1) + ld r28,-32(r1) + ld r29,-24(r1) + blr +END_FUNC(pmix_atomic_cmpset_rel_32) + + +START_FUNC(pmix_atomic_cmpset_64) + LSYM(3) ldarx r0, 0, r3 + cmpd 0, r0, r4 + bne- REFLSYM(4) + stdcx. 
r5, 0, r3 + bne- REFLSYM(3) + LSYM(4) + xor r3,r4,r0 + subfic r5,r3,0 + adde r3,r5,r3 + blr +END_FUNC(pmix_atomic_cmpset_64) + + +START_FUNC(pmix_atomic_cmpset_acq_64) + LSYM(7) ldarx r0, 0, r3 + cmpd 0, r0, r4 + bne- REFLSYM(8) + stdcx. r5, 0, r3 + bne- REFLSYM(7) + LSYM(8) + lwsync + xor r3,r4,r0 + subfic r5,r3,0 + adde r3,r5,r3 + blr +END_FUNC(pmix_atomic_cmpset_acq_64) + + +START_FUNC(pmix_atomic_cmpset_rel_64) + eieio + LSYM(9) ldarx r0, 0, r3 + cmpd 0, r0, r4 + bne- REFLSYM(10) + stdcx. r5, 0, r3 + bne- REFLSYM(9) + LSYM(10) + xor r3,r4,r0 + subfic r5,r3,0 + adde r3,r5,r3 + blr +END_FUNC(pmix_atomic_cmpset_rel_64) + + +START_FUNC(pmix_atomic_add_32) + LSYM(5) lwarx r0, 0, r3 + add r0, r4, r0 + stwcx. r0, 0, r3 + bne- REFLSYM(5) + + mr r3,r0 + blr +END_FUNC(pmix_atomic_add_32) + + +START_FUNC(pmix_atomic_sub_32) + LSYM(6) lwarx r0,0,r3 + subf r0,r4,r0 + stwcx. r0,0,r3 + bne- REFLSYM(6) + + mr r3,r0 + blr +END_FUNC(pmix_atomic_sub_32) + +START_FUNC(pmix_sys_timer_get_cycles) + LSYM(11) + mftbu r2 + rldicl r2,r2,0,32 + mftb r0 + rldicl r9,r0,0,32 + mftbu r0 + rldicl r0,r0,0,32 + cmpw cr7,r0,r2 + bne cr7,REFLSYM(11) + sldi r3,r0,32 + or r3,r3,r9 + blr +END_FUNC(pmix_sys_timer_get_cycles) diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/SPARCV9_32.asm b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/SPARCV9_32.asm new file mode 100644 index 00000000000..1ec34125a05 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/SPARCV9_32.asm @@ -0,0 +1,171 @@ +START_FILE + TEXT + + ALIGN(4) + + +START_FUNC(pmix_atomic_mb) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad + retl + nop +END_FUNC(pmix_atomic_mb) + + +START_FUNC(pmix_atomic_rmb) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #LoadLoad + retl + nop +END_FUNC(pmix_atomic_rmb) + + +START_FUNC(pmix_atomic_wmb) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #StoreStore + retl + nop +END_FUNC(pmix_atomic_wmb) + + +START_FUNC(pmix_atomic_cmpset_32) + !#PROLOGUE# 0 
+ !#PROLOGUE# 1 + casa [%o0] 0x80, %o1, %o2 + xor %o2, %o1, %o2 + subcc %g0, %o2, %g0 + retl + subx %g0, -1, %o0 +END_FUNC(pmix_atomic_cmpset_32) + + +START_FUNC(pmix_atomic_cmpset_acq_32) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + casa [%o0] 0x80, %o1, %o2 + xor %o2, %o1, %o2 + subcc %g0, %o2, %g0 + subx %g0, -1, %o0 + membar #LoadLoad + retl + sra %o0, 0, %o0 +END_FUNC(pmix_atomic_cmpset_acq_32) + + +START_FUNC(pmix_atomic_cmpset_rel_32) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #StoreStore + casa [%o0] 0x80, %o1, %o2 + xor %o2, %o1, %o2 + subcc %g0, %o2, %g0 + retl + subx %g0, -1, %o0 +END_FUNC(pmix_atomic_cmpset_rel_32) + + +START_FUNC(pmix_atomic_cmpset_64) + !#PROLOGUE# 0 + save %sp, -128, %sp + !#PROLOGUE# 1 + mov %i3, %o4 + mov %i4, %o5 + st %i1, [%fp-32] + st %i2, [%fp-28] + std %o4, [%fp-24] + ldx [%fp-24], %g1 + ldx [%fp-32], %g2 + casxa [%i0] 0x80, %g2, %g1 + stx %g1, [%fp-24] + + ld [%fp-24], %i5 + ld [%fp-32], %g1 + cmp %i5, %g1 + bne REFLSYM(12) + mov 0, %i0 + ld [%fp-20], %i2 + ld [%fp-28], %i1 + cmp %i2, %i1 + be,a REFLSYM(12) + mov 1, %i0 +LSYM(12) + ret + restore +END_FUNC(pmix_atomic_cmpset_64) + + +START_FUNC(pmix_atomic_cmpset_acq_64) + !#PROLOGUE# 0 + save %sp, -128, %sp + !#PROLOGUE# 1 + mov %i1, %o4 + mov %i2, %o5 + mov %i3, %o2 + mov %i4, %o3 + std %o4, [%fp-32] + std %o2, [%fp-24] + ldx [%fp-24], %g1 + ldx [%fp-32], %g2 + casxa [%i0] 0x80, %g2, %g1 + stx %g1, [%fp-24] + + ld [%fp-24], %i5 + ld [%fp-32], %g1 + cmp %i5, %g1 + bne REFLSYM(16) + mov 0, %i0 + ld [%fp-20], %i2 + ld [%fp-28], %i1 + cmp %i2, %i1 + be,a REFLSYM(16) + mov 1, %i0 +LSYM(16) + membar #LoadLoad + ret + restore +END_FUNC(pmix_atomic_cmpset_acq_64) + + +START_FUNC(pmix_atomic_cmpset_rel_64) + !#PROLOGUE# 0 + save %sp, -128, %sp + !#PROLOGUE# 1 + mov %i1, %o4 + mov %i2, %o5 + mov %i3, %o2 + mov %i4, %o3 + membar #StoreStore + std %o4, [%fp-32] + std %o2, [%fp-24] + ldx [%fp-24], %g1 + ldx [%fp-32], %g2 + casxa [%i0] 0x80, %g2, %g1 + stx %g1, [%fp-24] + + ld [%fp-24], %i5 + ld 
[%fp-32], %g1 + cmp %i5, %g1 + bne REFLSYM(21) + mov 0, %i0 + ld [%fp-20], %i2 + ld [%fp-28], %i1 + cmp %i2, %i1 + be,a REFLSYM(21) + mov 1, %i0 +LSYM(21) + ret + restore +END_FUNC(pmix_atomic_cmpset_rel_64) + + +START_FUNC(pmix_sys_timer_get_cycles) + save %sp,-96,%sp + rd %tick,%o0 + srlx %o0,32,%o1 + or %g0,%o1,%i0 + ret ! Result = %i0 + restore %o0,0,%o1 +END_FUNC(pmix_sys_timer_get_cycles) diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/SPARCV9_64.asm b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/SPARCV9_64.asm new file mode 100644 index 00000000000..85825577db7 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/SPARCV9_64.asm @@ -0,0 +1,111 @@ +START_FILE + TEXT + + ALIGN(4) + + +START_FUNC(pmix_atomic_mb) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad + retl + nop +END_FUNC(pmix_atomic_mb) + + +START_FUNC(pmix_atomic_rmb) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #LoadLoad + retl + nop +END_FUNC(pmix_atomic_rmb) + + +START_FUNC(pmix_atomic_wmb) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #StoreStore + retl + nop +END_FUNC(pmix_atomic_wmb) + + +START_FUNC(pmix_atomic_cmpset_32) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + casa [%o0] 0x80, %o1, %o2 + xor %o2, %o1, %o2 + subcc %g0, %o2, %g0 + retl + subx %g0, -1, %o0 +END_FUNC(pmix_atomic_cmpset_32) + + +START_FUNC(pmix_atomic_cmpset_acq_32) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + casa [%o0] 0x80, %o1, %o2 + xor %o2, %o1, %o2 + subcc %g0, %o2, %g0 + subx %g0, -1, %o0 + membar #LoadLoad + retl + sra %o0, 0, %o0 +END_FUNC(pmix_atomic_cmpset_acq_32) + + +START_FUNC(pmix_atomic_cmpset_rel_32) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #StoreStore + casa [%o0] 0x80, %o1, %o2 + xor %o2, %o1, %o2 + subcc %g0, %o2, %g0 + retl + subx %g0, -1, %o0 +END_FUNC(pmix_atomic_cmpset_rel_32) + + +START_FUNC(pmix_atomic_cmpset_64) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + casxa [%o0] 0x80, %o1, %o2 + mov 0, %o0 + xor %o2, %o1, %o2 + retl + movre %o2, 1, %o0 
+END_FUNC(pmix_atomic_cmpset_64) + + +START_FUNC(pmix_atomic_cmpset_acq_64) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + casxa [%o0] 0x80, %o1, %o2 + mov 0, %o0 + xor %o2, %o1, %o2 + movre %o2, 1, %o0 + membar #LoadLoad + retl + sra %o0, 0, %o0 +END_FUNC(pmix_atomic_cmpset_acq_64) + + +START_FUNC(pmix_atomic_cmpset_rel_64) + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #StoreStore + casxa [%o0] 0x80, %o1, %o2 + mov 0, %o0 + xor %o2, %o1, %o2 + retl + movre %o2, 1, %o0 +END_FUNC(pmix_atomic_cmpset_rel_64) + + +START_FUNC(pmix_sys_timer_get_cycles) + save %sp,-176,%sp + rd %tick,%o0 + ret ! Result = %i0 + restore %o0,0,%o0 +END_FUNC(pmix_sys_timer_get_cycles) diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/X86_64.asm b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/X86_64.asm new file mode 100644 index 00000000000..042c07109ec --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/X86_64.asm @@ -0,0 +1,52 @@ +START_FILE + TEXT + +START_FUNC(pmix_atomic_mb) + pushq %rbp + movq %rsp, %rbp + leave + ret +END_FUNC(pmix_atomic_mb) + + +START_FUNC(pmix_atomic_rmb) + pushq %rbp + movq %rsp, %rbp + leave + ret +END_FUNC(pmix_atomic_rmb) + + +START_FUNC(pmix_atomic_wmb) + pushq %rbp + movq %rsp, %rbp + leave + ret +END_FUNC(pmix_atomic_wmb) + + +START_FUNC(pmix_atomic_cmpset_32) + movl %esi, %eax + lock; cmpxchgl %edx,(%rdi) + sete %dl + movzbl %dl, %eax + ret +END_FUNC(pmix_atomic_cmpset_32) + + +START_FUNC(pmix_atomic_cmpset_64) + movq %rsi, %rax + lock; cmpxchgq %rdx,(%rdi) + sete %dl + movzbl %dl, %eax + ret +END_FUNC(pmix_atomic_cmpset_64) + + +START_FUNC(pmix_sys_timer_get_cycles) + rdtsc + salq $32, %rdx + mov %eax, %eax + orq %rdx, %rax + ret +END_FUNC(pmix_sys_timer_get_cycles) diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/aix.conf b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/aix.conf new file mode 100755 index 00000000000..482aabdd418 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/aix.conf @@ -0,0 +1,44 @@ +sub 
start_file() +{ + my $ret = ""; + if ($IS64BIT == 1) { + $ret .= "\t.machine \"ppc64\"\n"; + } else { + $ret .= "\t.machine \"ppc\"\n"; + } + $ret .= "\t.toc\n"; + return $ret; +} + + +sub start_func($) +{ + my $func_name = shift; + my $ret = ""; + + $ret = "\t$GLOBAL $func_name\n"; + $ret .= "\t$GLOBAL $GSYM$func_name\n"; + $ret .= "\t.csect [DS],3\n"; + + $ret .= "$func_name$SUFFIX\n"; + + if ($IS64BIT == 1) { + $ret .= "\t.llong .$func_name, TOC[tc0], 0\n"; + } else { + $ret .= "\t.long .$func_name, TOC[tc0], 0\n"; + } + $ret .= "\t.csect [PR]\n"; + + $ret .= "\t.align 2\n"; + $ret .= "$GSYM$func_name$SUFFIX\n"; + + return $ret; +} + + +sub end_func($) +{ + return ""; +} + +1 diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/default.conf b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/default.conf new file mode 100755 index 00000000000..c54f085cf99 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/base/default.conf @@ -0,0 +1,34 @@ +sub start_file +{ + return ""; +} + + +sub start_func($) +{ + my $func_name = shift; + my $ret = ""; + + $ret = "\t$GLOBAL $GSYM$func_name\n"; + if (! $TYPE eq "") { + $ret .= "\t.type $GSYM$func_name, $TYPE" . "function\n"; + } + $ret .= "$GSYM$func_name$SUFFIX\n"; + + return $ret; +} + + +sub end_func($) +{ + my $func_name = shift; + my $ret = ""; + + if ($SIZE != 0) { + $ret = "\t.size $GSYM$func_name, .-$GSYM$func_name\n"; + } + + return $ret; +} + +1 diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generate-all-asm.pl b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generate-all-asm.pl new file mode 100755 index 00000000000..e452cbeaf2e --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generate-all-asm.pl @@ -0,0 +1,27 @@ +#!/usr/bin/perl -w + +my $perl = shift; +my $srcdir = shift; +my $destdir = shift; + +if (! $perl || ! $srcdir || ! 
$destdir) { + print "ERROR: invalid argument to generate-all-asm.pl\n"; + print "usage: generate-all-asm.pl [PERL] [SRCDIR] [DESTDIR]\n"; + exit 1; +} + +open(DATAFILE, "$srcdir/asm-data.txt") || die "Could not open data file: $!\n"; + +my $ASMARCH = ""; +my $ASMFORMAT = ""; +my $ASMFILE = ""; + +while(<DATAFILE>) { + if (/^#/) { next; } + ($ASMARCH, $ASMFORMAT, $ASMFILE) = /(.*)\t(.*)\t(.*)/; + if (! $ASMARCH || ! $ASMFORMAT) { next; } + + print "--> Generating assembly for \"$ASMARCH\" \"$ASMFORMAT\"\n"; + system("$perl \'$srcdir/generate-asm.pl\' \'$ASMARCH\' \'$ASMFORMAT\' \'$srcdir/base\' \'$destdir/generated/atomic-$ASMFILE.s\'"); + +} diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generate-asm.pl b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generate-asm.pl new file mode 100644 index 00000000000..167a2a6e5e5 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generate-asm.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl -w +# +# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +my $asmarch = shift; +my $asmformat = shift; +my $basedir = shift; +my $output = shift; + +if ( ! 
$asmarch) { + print "usage: generate-asm.pl [ASMARCH] [ASMFORMAT] [BASEDIR] [OUTPUT NAME]\n"; + exit(1); +} + +open(INPUT, "$basedir/$asmarch.asm") || + die "Could not open $basedir/$asmarch.asm: $!\n"; +open(OUTPUT, ">$output") || die "Could not open $output: $!\n"; + +$CONFIG = "default"; +$TEXT = ""; +$GLOBAL = ""; +$SUFFIX = ""; +$GSYM = ""; +$LSYM = ""; +$TYPE = ""; +$SIZE = 0; +$ALIGN_LOG = 0; +$DEL_R_REG = 0; +$IS64BIT = 0; + +($CONFIG, $TEXT, $GLOBAL, $SUFFIX, $GSYM, $LSYM, $TYPE, $SIZE, $ALIGN_LOG, $DEL_R_REG, $IS64BIT, $GNU_STACK) = ( + $asmformat =~ /(.*)\-(.*)\-(.*)\-(.*)\-(.*)\-(.*)\-(.*)\-(.*)\-(.*)\-(.*)\-(.*)\-(.*)/); + +if (0) { +print "$asmformat\n"; +print "CONFIG: $CONFIG\n"; +print "TEXT: $TEXT\n"; +print "GLOBAL: $GLOBAL\n"; +print "SUFFIX: $SUFFIX\n"; +print "GSYM: $GSYM\n"; +print "LSYM: $LSYM\n"; +print "GNU_STACK: $GNU_STACK\n"; +} + +my $current_func = ""; +my $delete = 0; + +# load our configuration +do "$basedir/$CONFIG.conf" or die "Could not open config file $basedir/$CONFIG.conf: $!\n"; + +while (<INPUT>) { + s/TEXT/$TEXT/g; + s/GLOBAL/$GLOBAL/g; + s/REFGSYM\((.*)\)/$GSYM$1/g; + s/REFLSYM\((.*)\)/$LSYM$1/g; + s/GSYM\((.*)\)/$GSYM$1$SUFFIX/g; + s/LSYM\((.*)\)/$LSYM$1$SUFFIX/g; + + if ($DEL_R_REG == 0) { + s/cr([0-9][0-9]?)/$1/g; + s/r([0-9][0-9]?)/$1/g; + } + + if (/START_FILE/) { + $_ = start_file(); + } + + if (/START_FUNC\((.*)\)/) { + $current_func = $1; + $_ = start_func($current_func); + } + + if (/END_FUNC\((.*)\)/) { + $current_func = $1; + $_ = end_func($current_func); + } + + if ($ALIGN_LOG == 0) { + s/ALIGN\((\d*)\)/.align $1/g; + } else { + # Ugh... 
+ if (m/ALIGN\((\d*)\)/) { + $val = $1; + $result = 0; + while ($val > 1) { $val /= 2; $result++ } + s/ALIGN\((\d*)\)/.align $result/; + } + } + + if (/^\#START_64BIT/) { + $_ = ""; + if ($IS64BIT == 0) { + $delete = 1; + } + } + if (/^\#END_64BIT/) { + $_ = ""; + $delete = 0; + } + + if ($delete == 0) { + print OUTPUT $_; + } +} + +if ($GNU_STACK == 1) { + if ($asmarch eq "ARM") { + print OUTPUT "\n\t.section\t.note.GNU-stack,\"\",\%progbits\n"; + } else { + print OUTPUT "\n\t.section\t.note.GNU-stack,\"\",\@progbits\n"; + } +} + +close(INPUT); +close(OUTPUT); diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-cygwin-nongas.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-cygwin-nongas.s new file mode 100644 index 00000000000..0eabeddf488 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-cygwin-nongas.s @@ -0,0 +1,109 @@ + .text + + .globl _pmix_atomic_mb +_pmix_atomic_mb: + pushl %ebp + movl %esp, %ebp + leave + ret + + + .globl _pmix_atomic_rmb +_pmix_atomic_rmb: + pushl %ebp + movl %esp, %ebp + leave + ret + + + .globl _pmix_atomic_wmb +_pmix_atomic_wmb: + pushl %ebp + movl %esp, %ebp + leave + ret + + + .globl _pmix_atomic_cmpset_32 +_pmix_atomic_cmpset_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %edx + movl 16(%ebp), %ecx + movl 12(%ebp), %eax + lock; cmpxchgl %ecx,(%edx) + sete %dl + + movzbl %dl, %eax + leave + ret + + + .globl _pmix_atomic_cmpset_64 +_pmix_atomic_cmpset_64: + pushl %ebp + movl %esp, %ebp + subl $32, %esp + movl %ebx, -12(%ebp) + movl %esi, -8(%ebp) + movl %edi, -4(%ebp) + movl 8(%ebp), %edi + movl 12(%ebp), %eax + movl 16(%ebp), %edx + movl %eax, -24(%ebp) + movl %edx, -20(%ebp) + movl 20(%ebp), %eax + movl 24(%ebp), %edx + movl %eax, -32(%ebp) + movl %edx, -28(%ebp) + movl -24(%ebp), %ebx + movl -20(%ebp), %edx + movl -32(%ebp), %esi + movl -28(%ebp), %ecx + movl %ebx, %eax + push %ebx + movl %esi, %ebx + lock; cmpxchg8b (%edi) + sete %dl + pop %ebx + + 
movzbl %dl, %eax + movl -12(%ebp), %ebx + movl -8(%ebp), %esi + movl -4(%ebp), %edi + movl %ebp, %esp + popl %ebp + ret + + + .globl _pmix_atomic_add_32 +_pmix_atomic_add_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %eax + movl 12(%ebp), %edx + lock; addl %edx,(%eax) + movl (%eax), %eax + leave + ret + + + .globl _pmix_atomic_sub_32 +_pmix_atomic_sub_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %eax + movl 12(%ebp), %edx + lock; subl %edx,(%eax) + movl (%eax), %eax + leave + ret + + + .globl _pmix_sys_timer_get_cycles +_pmix_sys_timer_get_cycles: + pushl %ebp + movl %esp, %ebp + rdtsc + popl %ebp + ret diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-cygwin.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-cygwin.s new file mode 100644 index 00000000000..9ffab89085f --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-cygwin.s @@ -0,0 +1,111 @@ + .text + + .globl _pmix_atomic_mb +_pmix_atomic_mb: + pushl %ebp + movl %esp, %ebp + leave + ret + + + .globl _pmix_atomic_rmb +_pmix_atomic_rmb: + pushl %ebp + movl %esp, %ebp + leave + ret + + + .globl _pmix_atomic_wmb +_pmix_atomic_wmb: + pushl %ebp + movl %esp, %ebp + leave + ret + + + .globl _pmix_atomic_cmpset_32 +_pmix_atomic_cmpset_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %edx + movl 16(%ebp), %ecx + movl 12(%ebp), %eax + lock; cmpxchgl %ecx,(%edx) + sete %dl + + movzbl %dl, %eax + leave + ret + + + .globl _pmix_atomic_cmpset_64 +_pmix_atomic_cmpset_64: + pushl %ebp + movl %esp, %ebp + subl $32, %esp + movl %ebx, -12(%ebp) + movl %esi, -8(%ebp) + movl %edi, -4(%ebp) + movl 8(%ebp), %edi + movl 12(%ebp), %eax + movl 16(%ebp), %edx + movl %eax, -24(%ebp) + movl %edx, -20(%ebp) + movl 20(%ebp), %eax + movl 24(%ebp), %edx + movl %eax, -32(%ebp) + movl %edx, -28(%ebp) + movl -24(%ebp), %ebx + movl -20(%ebp), %edx + movl -32(%ebp), %esi + movl -28(%ebp), %ecx + movl %ebx, %eax + push %ebx + movl %esi, %ebx + lock; cmpxchg8b 
(%edi) + sete %dl + pop %ebx + + movzbl %dl, %eax + movl -12(%ebp), %ebx + movl -8(%ebp), %esi + movl -4(%ebp), %edi + movl %ebp, %esp + popl %ebp + ret + + + .globl _pmix_atomic_add_32 +_pmix_atomic_add_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %eax + movl 12(%ebp), %edx + lock; addl %edx,(%eax) + movl (%eax), %eax + leave + ret + + + .globl _pmix_atomic_sub_32 +_pmix_atomic_sub_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %eax + movl 12(%ebp), %edx + lock; subl %edx,(%eax) + movl (%eax), %eax + leave + ret + + + .globl _pmix_sys_timer_get_cycles +_pmix_sys_timer_get_cycles: + pushl %ebp + movl %esp, %ebp + rdtsc + popl %ebp + ret + + .section .note.GNU-stack,"",@progbits diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-linux-nongas.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-linux-nongas.s new file mode 100644 index 00000000000..99971a156e7 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-linux-nongas.s @@ -0,0 +1,125 @@ + .text + + .globl pmix_atomic_mb + .type pmix_atomic_mb, @function +pmix_atomic_mb: + pushl %ebp + movl %esp, %ebp + leave + ret + .size pmix_atomic_mb, .-pmix_atomic_mb + + + .globl pmix_atomic_rmb + .type pmix_atomic_rmb, @function +pmix_atomic_rmb: + pushl %ebp + movl %esp, %ebp + leave + ret + .size pmix_atomic_rmb, .-pmix_atomic_rmb + + + .globl pmix_atomic_wmb + .type pmix_atomic_wmb, @function +pmix_atomic_wmb: + pushl %ebp + movl %esp, %ebp + leave + ret + .size pmix_atomic_wmb, .-pmix_atomic_wmb + + + .globl pmix_atomic_cmpset_32 + .type pmix_atomic_cmpset_32, @function +pmix_atomic_cmpset_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %edx + movl 16(%ebp), %ecx + movl 12(%ebp), %eax + lock; cmpxchgl %ecx,(%edx) + sete %dl + + movzbl %dl, %eax + leave + ret + .size pmix_atomic_cmpset_32, .-pmix_atomic_cmpset_32 + + + .globl pmix_atomic_cmpset_64 + .type pmix_atomic_cmpset_64, @function +pmix_atomic_cmpset_64: + pushl %ebp + movl 
%esp, %ebp + subl $32, %esp + movl %ebx, -12(%ebp) + movl %esi, -8(%ebp) + movl %edi, -4(%ebp) + movl 8(%ebp), %edi + movl 12(%ebp), %eax + movl 16(%ebp), %edx + movl %eax, -24(%ebp) + movl %edx, -20(%ebp) + movl 20(%ebp), %eax + movl 24(%ebp), %edx + movl %eax, -32(%ebp) + movl %edx, -28(%ebp) + movl -24(%ebp), %ebx + movl -20(%ebp), %edx + movl -32(%ebp), %esi + movl -28(%ebp), %ecx + movl %ebx, %eax + push %ebx + movl %esi, %ebx + lock; cmpxchg8b (%edi) + sete %dl + pop %ebx + + movzbl %dl, %eax + movl -12(%ebp), %ebx + movl -8(%ebp), %esi + movl -4(%ebp), %edi + movl %ebp, %esp + popl %ebp + ret + .size pmix_atomic_cmpset_64, .-pmix_atomic_cmpset_64 + + + .globl pmix_atomic_add_32 + .type pmix_atomic_add_32, @function +pmix_atomic_add_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %eax + movl 12(%ebp), %edx + lock; addl %edx,(%eax) + movl (%eax), %eax + leave + ret + .size pmix_atomic_add_32, .-pmix_atomic_add_32 + + + .globl pmix_atomic_sub_32 + .type pmix_atomic_sub_32, @function +pmix_atomic_sub_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %eax + movl 12(%ebp), %edx + lock; subl %edx,(%eax) + movl (%eax), %eax + leave + ret + .size pmix_atomic_sub_32, .-pmix_atomic_sub_32 + + + .globl pmix_sys_timer_get_cycles + .type pmix_sys_timer_get_cycles, @function +pmix_sys_timer_get_cycles: + pushl %ebp + movl %esp, %ebp + rdtsc + popl %ebp + ret + .size pmix_sys_timer_get_cycles, .-pmix_sys_timer_get_cycles diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-linux.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-linux.s new file mode 100644 index 00000000000..a1f639ea514 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-linux.s @@ -0,0 +1,127 @@ + .text + + .globl pmix_atomic_mb + .type pmix_atomic_mb, @function +pmix_atomic_mb: + pushl %ebp + movl %esp, %ebp + leave + ret + .size pmix_atomic_mb, .-pmix_atomic_mb + + + .globl pmix_atomic_rmb + .type pmix_atomic_rmb, @function 
+pmix_atomic_rmb: + pushl %ebp + movl %esp, %ebp + leave + ret + .size pmix_atomic_rmb, .-pmix_atomic_rmb + + + .globl pmix_atomic_wmb + .type pmix_atomic_wmb, @function +pmix_atomic_wmb: + pushl %ebp + movl %esp, %ebp + leave + ret + .size pmix_atomic_wmb, .-pmix_atomic_wmb + + + .globl pmix_atomic_cmpset_32 + .type pmix_atomic_cmpset_32, @function +pmix_atomic_cmpset_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %edx + movl 16(%ebp), %ecx + movl 12(%ebp), %eax + lock; cmpxchgl %ecx,(%edx) + sete %dl + + movzbl %dl, %eax + leave + ret + .size pmix_atomic_cmpset_32, .-pmix_atomic_cmpset_32 + + + .globl pmix_atomic_cmpset_64 + .type pmix_atomic_cmpset_64, @function +pmix_atomic_cmpset_64: + pushl %ebp + movl %esp, %ebp + subl $32, %esp + movl %ebx, -12(%ebp) + movl %esi, -8(%ebp) + movl %edi, -4(%ebp) + movl 8(%ebp), %edi + movl 12(%ebp), %eax + movl 16(%ebp), %edx + movl %eax, -24(%ebp) + movl %edx, -20(%ebp) + movl 20(%ebp), %eax + movl 24(%ebp), %edx + movl %eax, -32(%ebp) + movl %edx, -28(%ebp) + movl -24(%ebp), %ebx + movl -20(%ebp), %edx + movl -32(%ebp), %esi + movl -28(%ebp), %ecx + movl %ebx, %eax + push %ebx + movl %esi, %ebx + lock; cmpxchg8b (%edi) + sete %dl + pop %ebx + + movzbl %dl, %eax + movl -12(%ebp), %ebx + movl -8(%ebp), %esi + movl -4(%ebp), %edi + movl %ebp, %esp + popl %ebp + ret + .size pmix_atomic_cmpset_64, .-pmix_atomic_cmpset_64 + + + .globl pmix_atomic_add_32 + .type pmix_atomic_add_32, @function +pmix_atomic_add_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %eax + movl 12(%ebp), %edx + lock; addl %edx,(%eax) + movl (%eax), %eax + leave + ret + .size pmix_atomic_add_32, .-pmix_atomic_add_32 + + + .globl pmix_atomic_sub_32 + .type pmix_atomic_sub_32, @function +pmix_atomic_sub_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %eax + movl 12(%ebp), %edx + lock; subl %edx,(%eax) + movl (%eax), %eax + leave + ret + .size pmix_atomic_sub_32, .-pmix_atomic_sub_32 + + + .globl pmix_sys_timer_get_cycles + .type 
pmix_sys_timer_get_cycles, @function +pmix_sys_timer_get_cycles: + pushl %ebp + movl %esp, %ebp + rdtsc + popl %ebp + ret + .size pmix_sys_timer_get_cycles, .-pmix_sys_timer_get_cycles + + .section .note.GNU-stack,"",@progbits diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-osx.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-osx.s new file mode 100644 index 00000000000..0eabeddf488 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia32-osx.s @@ -0,0 +1,109 @@ + .text + + .globl _pmix_atomic_mb +_pmix_atomic_mb: + pushl %ebp + movl %esp, %ebp + leave + ret + + + .globl _pmix_atomic_rmb +_pmix_atomic_rmb: + pushl %ebp + movl %esp, %ebp + leave + ret + + + .globl _pmix_atomic_wmb +_pmix_atomic_wmb: + pushl %ebp + movl %esp, %ebp + leave + ret + + + .globl _pmix_atomic_cmpset_32 +_pmix_atomic_cmpset_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %edx + movl 16(%ebp), %ecx + movl 12(%ebp), %eax + lock; cmpxchgl %ecx,(%edx) + sete %dl + + movzbl %dl, %eax + leave + ret + + + .globl _pmix_atomic_cmpset_64 +_pmix_atomic_cmpset_64: + pushl %ebp + movl %esp, %ebp + subl $32, %esp + movl %ebx, -12(%ebp) + movl %esi, -8(%ebp) + movl %edi, -4(%ebp) + movl 8(%ebp), %edi + movl 12(%ebp), %eax + movl 16(%ebp), %edx + movl %eax, -24(%ebp) + movl %edx, -20(%ebp) + movl 20(%ebp), %eax + movl 24(%ebp), %edx + movl %eax, -32(%ebp) + movl %edx, -28(%ebp) + movl -24(%ebp), %ebx + movl -20(%ebp), %edx + movl -32(%ebp), %esi + movl -28(%ebp), %ecx + movl %ebx, %eax + push %ebx + movl %esi, %ebx + lock; cmpxchg8b (%edi) + sete %dl + pop %ebx + + movzbl %dl, %eax + movl -12(%ebp), %ebx + movl -8(%ebp), %esi + movl -4(%ebp), %edi + movl %ebp, %esp + popl %ebp + ret + + + .globl _pmix_atomic_add_32 +_pmix_atomic_add_32: + pushl %ebp + movl %esp, %ebp + movl 8(%ebp), %eax + movl 12(%ebp), %edx + lock; addl %edx,(%eax) + movl (%eax), %eax + leave + ret + + + .globl _pmix_atomic_sub_32 +_pmix_atomic_sub_32: + pushl 
%ebp + movl %esp, %ebp + movl 8(%ebp), %eax + movl 12(%ebp), %edx + lock; subl %edx,(%eax) + movl (%eax), %eax + leave + ret + + + .globl _pmix_sys_timer_get_cycles +_pmix_sys_timer_get_cycles: + pushl %ebp + movl %esp, %ebp + rdtsc + popl %ebp + ret diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia64-linux-nongas.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia64-linux-nongas.s new file mode 100644 index 00000000000..9e13953f4bd --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia64-linux-nongas.s @@ -0,0 +1,108 @@ + + .pred.safe_across_calls p1-p5,p16-p63 + .text + .align 16 + .global pmix_atomic_mb# + .proc pmix_atomic_mb# +pmix_atomic_mb: + .prologue + .body + mf + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_mb# + .align 16 + .global pmix_atomic_rmb# + .proc pmix_atomic_rmb# +pmix_atomic_rmb: + .prologue + .body + mf + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_rmb# + .align 16 + .global pmix_atomic_wmb# + .proc pmix_atomic_wmb# +pmix_atomic_wmb: + .prologue + .body + mf + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_wmb# + .align 16 + .global pmix_atomic_cmpset_acq_32# + .proc pmix_atomic_cmpset_acq_32# +pmix_atomic_cmpset_acq_32: + .prologue + .body + mov ar.ccv=r33;; + cmpxchg4.acq r32=[r32],r34,ar.ccv + ;; + cmp4.eq p6, p7 = r32, r33 + ;; + (p6) addl r8 = 1, r0 + (p7) mov r8 = r0 + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_cmpset_acq_32# + .align 16 + .global pmix_atomic_cmpset_rel_32# + .proc pmix_atomic_cmpset_rel_32# +pmix_atomic_cmpset_rel_32: + .prologue + .body + mov ar.ccv=r33;; + cmpxchg4.rel r32=[r32],r34,ar.ccv + ;; + cmp4.eq p6, p7 = r32, r33 + ;; + (p6) addl r8 = 1, r0 + (p7) mov r8 = r0 + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_cmpset_rel_32# + .align 16 + .global pmix_atomic_cmpset_acq_64# + .proc pmix_atomic_cmpset_acq_64# +pmix_atomic_cmpset_acq_64: + .prologue + .body + mov ar.ccv=r33;; + cmpxchg8.acq r32=[r32],r34,ar.ccv + ;; + cmp.eq p6, p7 = r33, r32 + ;; + 
(p6) addl r8 = 1, r0 + (p7) mov r8 = r0 + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_cmpset_acq_64# + .align 16 + .global pmix_atomic_cmpset_rel_64# + .proc pmix_atomic_cmpset_rel_64# +pmix_atomic_cmpset_rel_64: + .prologue + .body + mov ar.ccv=r33;; + cmpxchg8.rel r32=[r32],r34,ar.ccv + ;; + cmp.eq p6, p7 = r33, r32 + ;; + (p6) addl r8 = 1, r0 + (p7) mov r8 = r0 + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_cmpset_rel_64# + .align 16 + .global pmix_sys_timer_get_cycles# + .proc pmix_sys_timer_get_cycles# +pmix_sys_timer_get_cycles: + .prologue + .body + mov r8=ar.itc + br.ret.sptk.many b0 + ;; + .endp pmix_sys_timer_get_cycles# + .ident "GCC: (GNU) 3.2.3 20030502 (Red Hat Linux 3.2.3-49)" diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia64-linux.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia64-linux.s new file mode 100644 index 00000000000..2bc097f2af1 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-ia64-linux.s @@ -0,0 +1,110 @@ + + .pred.safe_across_calls p1-p5,p16-p63 + .text + .align 16 + .global pmix_atomic_mb# + .proc pmix_atomic_mb# +pmix_atomic_mb: + .prologue + .body + mf + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_mb# + .align 16 + .global pmix_atomic_rmb# + .proc pmix_atomic_rmb# +pmix_atomic_rmb: + .prologue + .body + mf + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_rmb# + .align 16 + .global pmix_atomic_wmb# + .proc pmix_atomic_wmb# +pmix_atomic_wmb: + .prologue + .body + mf + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_wmb# + .align 16 + .global pmix_atomic_cmpset_acq_32# + .proc pmix_atomic_cmpset_acq_32# +pmix_atomic_cmpset_acq_32: + .prologue + .body + mov ar.ccv=r33;; + cmpxchg4.acq r32=[r32],r34,ar.ccv + ;; + cmp4.eq p6, p7 = r32, r33 + ;; + (p6) addl r8 = 1, r0 + (p7) mov r8 = r0 + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_cmpset_acq_32# + .align 16 + .global pmix_atomic_cmpset_rel_32# + .proc pmix_atomic_cmpset_rel_32# +pmix_atomic_cmpset_rel_32: + .prologue 
+ .body + mov ar.ccv=r33;; + cmpxchg4.rel r32=[r32],r34,ar.ccv + ;; + cmp4.eq p6, p7 = r32, r33 + ;; + (p6) addl r8 = 1, r0 + (p7) mov r8 = r0 + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_cmpset_rel_32# + .align 16 + .global pmix_atomic_cmpset_acq_64# + .proc pmix_atomic_cmpset_acq_64# +pmix_atomic_cmpset_acq_64: + .prologue + .body + mov ar.ccv=r33;; + cmpxchg8.acq r32=[r32],r34,ar.ccv + ;; + cmp.eq p6, p7 = r33, r32 + ;; + (p6) addl r8 = 1, r0 + (p7) mov r8 = r0 + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_cmpset_acq_64# + .align 16 + .global pmix_atomic_cmpset_rel_64# + .proc pmix_atomic_cmpset_rel_64# +pmix_atomic_cmpset_rel_64: + .prologue + .body + mov ar.ccv=r33;; + cmpxchg8.rel r32=[r32],r34,ar.ccv + ;; + cmp.eq p6, p7 = r33, r32 + ;; + (p6) addl r8 = 1, r0 + (p7) mov r8 = r0 + br.ret.sptk.many b0 + ;; + .endp pmix_atomic_cmpset_rel_64# + .align 16 + .global pmix_sys_timer_get_cycles# + .proc pmix_sys_timer_get_cycles# +pmix_sys_timer_get_cycles: + .prologue + .body + mov r8=ar.itc + br.ret.sptk.many b0 + ;; + .endp pmix_sys_timer_get_cycles# + .ident "GCC: (GNU) 3.2.3 20030502 (Red Hat Linux 3.2.3-49)" + + .section .note.GNU-stack,"",@progbits diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips-irix.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips-irix.s new file mode 100644 index 00000000000..27d4ae3d87b --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips-irix.s @@ -0,0 +1,195 @@ + +#ifdef __linux__ +#include +#else +#include +#endif +#include + + .text + + .align 3 +LEAF(pmix_atomic_mb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_mb) + + + .align 3 +LEAF(pmix_atomic_rmb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_rmb) + + +LEAF(pmix_atomic_wmb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra 
+END(pmix_atomic_wmb) + + +LEAF(pmix_atomic_cmpset_32) + .set noreorder +retry1: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done1 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry1 +done1: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_32) + + +LEAF(pmix_atomic_cmpset_acq_32) + .set noreorder +retry2: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done2 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry2 +done2: +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_acq_32) + + +LEAF(pmix_atomic_cmpset_rel_32) + .set noreorder +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif +retry3: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done3 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry3 +done3: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_rel_32) + +#ifdef __mips64 +LEAF(pmix_atomic_cmpset_64) + .set noreorder +retry4: + lld $3, 0($4) + bne $3, $5, done4 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry4 +done4: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_64) + + +LEAF(pmix_atomic_cmpset_acq_64) + .set noreorder +retry5: + lld $3, 0($4) + bne $3, $5, done5 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry5 +done5: + sync + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_acq_64) + + +LEAF(pmix_atomic_cmpset_rel_64) + .set noreorder + sync +retry6: + lld $3, 0($4) + bne $3, $5, done6 + or $2, $6, 0 + scd $2, 0($4) + 
beqz $2, retry6 +done6: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_rel_64) +#endif /* __mips64 */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips-linux.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips-linux.s new file mode 100644 index 00000000000..9339285f890 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips-linux.s @@ -0,0 +1,197 @@ + +#ifdef __linux__ +#include +#else +#include +#endif +#include + + .text + + .align 3 +LEAF(pmix_atomic_mb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_mb) + + + .align 3 +LEAF(pmix_atomic_rmb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_rmb) + + +LEAF(pmix_atomic_wmb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_wmb) + + +LEAF(pmix_atomic_cmpset_32) + .set noreorder +retry1: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done1 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry1 +done1: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_32) + + +LEAF(pmix_atomic_cmpset_acq_32) + .set noreorder +retry2: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done2 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry2 +done2: +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_acq_32) + + +LEAF(pmix_atomic_cmpset_rel_32) + .set noreorder +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif +retry3: +#ifdef __linux__ 
+ .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done3 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry3 +done3: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_rel_32) + +#ifdef __mips64 +LEAF(pmix_atomic_cmpset_64) + .set noreorder +retry4: + lld $3, 0($4) + bne $3, $5, done4 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry4 +done4: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_64) + + +LEAF(pmix_atomic_cmpset_acq_64) + .set noreorder +retry5: + lld $3, 0($4) + bne $3, $5, done5 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry5 +done5: + sync + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_acq_64) + + +LEAF(pmix_atomic_cmpset_rel_64) + .set noreorder + sync +retry6: + lld $3, 0($4) + bne $3, $5, done6 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry6 +done6: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_rel_64) +#endif /* __mips64 */ + + .section .note.GNU-stack,"",@progbits diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips64-linux.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips64-linux.s new file mode 100644 index 00000000000..9339285f890 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips64-linux.s @@ -0,0 +1,197 @@ + +#ifdef __linux__ +#include +#else +#include +#endif +#include + + .text + + .align 3 +LEAF(pmix_atomic_mb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_mb) + + + .align 3 +LEAF(pmix_atomic_rmb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_rmb) + + +LEAF(pmix_atomic_wmb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_wmb) + + +LEAF(pmix_atomic_cmpset_32) + 
.set noreorder +retry1: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done1 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry1 +done1: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_32) + + +LEAF(pmix_atomic_cmpset_acq_32) + .set noreorder +retry2: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done2 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry2 +done2: +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_acq_32) + + +LEAF(pmix_atomic_cmpset_rel_32) + .set noreorder +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif +retry3: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done3 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry3 +done3: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_rel_32) + +#ifdef __mips64 +LEAF(pmix_atomic_cmpset_64) + .set noreorder +retry4: + lld $3, 0($4) + bne $3, $5, done4 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry4 +done4: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_64) + + +LEAF(pmix_atomic_cmpset_acq_64) + .set noreorder +retry5: + lld $3, 0($4) + bne $3, $5, done5 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry5 +done5: + sync + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_acq_64) + + +LEAF(pmix_atomic_cmpset_rel_64) + .set noreorder + sync +retry6: + lld $3, 0($4) + bne $3, $5, done6 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry6 +done6: + xor $3,$3,$5 + j ra + sltu 
$2,$3,1 + .set reorder +END(pmix_atomic_cmpset_rel_64) +#endif /* __mips64 */ + + .section .note.GNU-stack,"",@progbits diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips64el.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips64el.s new file mode 100644 index 00000000000..27d4ae3d87b --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-mips64el.s @@ -0,0 +1,195 @@ + +#ifdef __linux__ +#include +#else +#include +#endif +#include + + .text + + .align 3 +LEAF(pmix_atomic_mb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_mb) + + + .align 3 +LEAF(pmix_atomic_rmb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_rmb) + + +LEAF(pmix_atomic_wmb) +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + j ra +END(pmix_atomic_wmb) + + +LEAF(pmix_atomic_cmpset_32) + .set noreorder +retry1: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done1 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry1 +done1: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_32) + + +LEAF(pmix_atomic_cmpset_acq_32) + .set noreorder +retry2: +#ifdef __linux__ + .set mips2 +#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done2 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry2 +done2: +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_acq_32) + + +LEAF(pmix_atomic_cmpset_rel_32) + .set noreorder +#ifdef __linux__ + .set mips2 +#endif + sync +#ifdef __linux__ + .set mips0 +#endif +retry3: +#ifdef __linux__ + .set mips2 
+#endif + ll $3, 0($4) +#ifdef __linux__ + .set mips0 +#endif + bne $3, $5, done3 + or $2, $6, 0 +#ifdef __linux__ + .set mips2 +#endif + sc $2, 0($4) +#ifdef __linux__ + .set mips0 +#endif + beqz $2, retry3 +done3: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_rel_32) + +#ifdef __mips64 +LEAF(pmix_atomic_cmpset_64) + .set noreorder +retry4: + lld $3, 0($4) + bne $3, $5, done4 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry4 +done4: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_64) + + +LEAF(pmix_atomic_cmpset_acq_64) + .set noreorder +retry5: + lld $3, 0($4) + bne $3, $5, done5 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry5 +done5: + sync + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_acq_64) + + +LEAF(pmix_atomic_cmpset_rel_64) + .set noreorder + sync +retry6: + lld $3, 0($4) + bne $3, $5, done6 + or $2, $6, 0 + scd $2, 0($4) + beqz $2, retry6 +done6: + xor $3,$3,$5 + j ra + sltu $2,$3,1 + .set reorder +END(pmix_atomic_cmpset_rel_64) +#endif /* __mips64 */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-64-osx.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-64-osx.s new file mode 100644 index 00000000000..ebe9d8ad2bb --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-64-osx.s @@ -0,0 +1,165 @@ + .text + + .align 2 + .globl _pmix_atomic_mb +_pmix_atomic_mb: + sync + blr + + + .globl _pmix_atomic_rmb +_pmix_atomic_rmb: + lwsync + blr + + + .globl _pmix_atomic_wmb +_pmix_atomic_wmb: + eieio + blr + + + .globl _pmix_atomic_cmpset_32 +_pmix_atomic_cmpset_32: + L1: lwarx r0, 0, r3 + cmpw 0, r0, r4 + bne- L2 + stwcx. r5, 0, r3 + bne- L1 + L2: + xor r3,r0,r4 + subfic r5,r3,0 + adde r3,r5,r3 + blr + + + .globl _pmix_atomic_cmpset_acq_32 +_pmix_atomic_cmpset_acq_32: + L3: lwarx r0, 0, r3 + cmpw 0, r0, r4 + bne- L4 + stwcx. 
r5, 0, r3 + bne- L3 + sync + L4: + xor r3,r0,r4 + subfic r5,r3,0 + adde r3,r5,r3 + lwsync + blr + + + .globl _pmix_atomic_cmpset_rel_32 +_pmix_atomic_cmpset_rel_32: + eieio + L5: lwarx r0, 0, r3 + cmpw 0, r0, r4 + bne- L6 + stwcx. r5, 0, r3 + bne- L5 + sync + L6: + xor r3,r0,r4 + subfic r5,r3,0 + adde r3,r5,r3 + blr + + .globl _pmix_atomic_cmpset_64 +_pmix_atomic_cmpset_64: + stw r4,-32(r1) + stw r5,-28(r1) + stw r6,-24(r1) + stw r7,-20(r1) + ld r5,-32(r1) + ld r7,-24(r1) + L7: ldarx r9, 0, r3 + cmpd 0, r9, r5 + bne- L8 + stdcx. r7, 0, r3 + bne- L7 + L8: + xor r3,r5,r9 + subfic r5,r3,0 + adde r3,r5,r3 + blr + + + .globl _pmix_atomic_cmpset_acq_64 +_pmix_atomic_cmpset_acq_64: + stw r4,-32(r1) + stw r5,-28(r1) + stw r6,-24(r1) + stw r7,-20(r1) + ld r5,-32(r1) + ld r7,-24(r1) + + L9: ldarx r9, 0, r3 + cmpd 0, r9, r5 + bne- L10 + stdcx. r7, 0, r3 + bne- L9 + L10: + xor r3,r5,r9 + subfic r5,r3,0 + adde r3,r5,r3 + blr + lwsync + blr + + + .globl _pmix_atomic_cmpset_rel_64 +_pmix_atomic_cmpset_rel_64: + stw r4,-32(r1) + stw r5,-28(r1) + stw r6,-24(r1) + stw r7,-20(r1) + ld r5,-32(r1) + ld r7,-24(r1) + + eieio + L11: ldarx r9, 0, r3 + cmpd 0, r9, r5 + bne- L12 + stdcx. r7, 0, r3 + bne- L11 + L12: + xor r3,r5,r9 + subfic r5,r3,0 + adde r3,r5,r3 + blr + lwsync + blr + + + .globl _pmix_atomic_add_32 +_pmix_atomic_add_32: + L13: lwarx r0, 0, r3 + add r0, r4, r0 + stwcx. r0, 0, r3 + bne- L13 + mr r3,r0 + blr + + + .globl _pmix_atomic_sub_32 +_pmix_atomic_sub_32: + L14: lwarx r0,0,r3 + subf r0,r4,r0 + stwcx. 
r0,0,r3 + bne- L14 + mr r3,r0 + blr + + .globl _pmix_sys_timer_get_cycles +_pmix_sys_timer_get_cycles: + L15: + mftbu r0 + mftb r11 + mftbu r2 + cmpw cr7,r2,r0 + bne+ cr7,L15 + li r4,0 + li r9,0 + or r3,r2,r9 + or r4,r4,r11 + blr diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-aix.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-aix.s new file mode 100644 index 00000000000..7cc2ba0b9dc --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-aix.s @@ -0,0 +1,156 @@ + .machine "ppc" + .toc + .csect .text[PR] + + .align 2 + .globl pmix_atomic_mb + .globl .pmix_atomic_mb + .csect [DS],3 +pmix_atomic_mb: + .long .pmix_atomic_mb, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_mb: + sync + blr + + + .globl pmix_atomic_rmb + .globl .pmix_atomic_rmb + .csect [DS],3 +pmix_atomic_rmb: + .long .pmix_atomic_rmb, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_rmb: + lwsync + blr + + + .globl pmix_atomic_wmb + .globl .pmix_atomic_wmb + .csect [DS],3 +pmix_atomic_wmb: + .long .pmix_atomic_wmb, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_wmb: + eieio + blr + + + .globl pmix_atomic_cmpset_32 + .globl .pmix_atomic_cmpset_32 + .csect [DS],3 +pmix_atomic_cmpset_32: + .long .pmix_atomic_cmpset_32, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_cmpset_32: + L1: lwarx 0, 0, 3 + cmpw 0, 0, 4 + bne- L2 + stwcx. 5, 0, 3 + bne- L1 + L2: + xor 3,0,4 + subfic 5,3,0 + adde 3,5,3 + blr + + + .globl pmix_atomic_cmpset_acq_32 + .globl .pmix_atomic_cmpset_acq_32 + .csect [DS],3 +pmix_atomic_cmpset_acq_32: + .long .pmix_atomic_cmpset_acq_32, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_cmpset_acq_32: + L3: lwarx 0, 0, 3 + cmpw 0, 0, 4 + bne- L4 + stwcx. 
5, 0, 3 + bne- L3 + sync + L4: + xor 3,0,4 + subfic 5,3,0 + adde 3,5,3 + lwsync + blr + + + .globl pmix_atomic_cmpset_rel_32 + .globl .pmix_atomic_cmpset_rel_32 + .csect [DS],3 +pmix_atomic_cmpset_rel_32: + .long .pmix_atomic_cmpset_rel_32, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_cmpset_rel_32: + eieio + L5: lwarx 0, 0, 3 + cmpw 0, 0, 4 + bne- L6 + stwcx. 5, 0, 3 + bne- L5 + sync + L6: + xor 3,0,4 + subfic 5,3,0 + adde 3,5,3 + blr + + + + .globl pmix_atomic_add_32 + .globl .pmix_atomic_add_32 + .csect [DS],3 +pmix_atomic_add_32: + .long .pmix_atomic_add_32, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_add_32: + L13: lwarx 0, 0, 3 + add 0, 4, 0 + stwcx. 0, 0, 3 + bne- L13 + mr 3,0 + blr + + + .globl pmix_atomic_sub_32 + .globl .pmix_atomic_sub_32 + .csect [DS],3 +pmix_atomic_sub_32: + .long .pmix_atomic_sub_32, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_sub_32: + L14: lwarx 0,0,3 + subf 0,4,0 + stwcx. 0,0,3 + bne- L14 + mr 3,0 + blr + + .globl pmix_sys_timer_get_cycles + .globl .pmix_sys_timer_get_cycles + .csect [DS],3 +pmix_sys_timer_get_cycles: + .long .pmix_sys_timer_get_cycles, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_sys_timer_get_cycles: + L15: + mftbu 0 + mftb 11 + mftbu 2 + cmpw 7,2,0 + bne+ 7,L15 + li 4,0 + li 9,0 + or 3,2,9 + or 4,4,11 + blr diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-linux-nongas.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-linux-nongas.s new file mode 100644 index 00000000000..37b36c22b08 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-linux-nongas.s @@ -0,0 +1,118 @@ + .text + + .align 2 + .globl pmix_atomic_mb + .type pmix_atomic_mb, @function +pmix_atomic_mb: + sync + blr + .size pmix_atomic_mb, .-pmix_atomic_mb + + + .globl pmix_atomic_rmb + .type pmix_atomic_rmb, @function +pmix_atomic_rmb: + lwsync + blr + .size pmix_atomic_rmb, .-pmix_atomic_rmb + + + .globl pmix_atomic_wmb + .type 
pmix_atomic_wmb, @function +pmix_atomic_wmb: + eieio + blr + .size pmix_atomic_wmb, .-pmix_atomic_wmb + + + .globl pmix_atomic_cmpset_32 + .type pmix_atomic_cmpset_32, @function +pmix_atomic_cmpset_32: + .L1: lwarx 0, 0, 3 + cmpw 0, 0, 4 + bne- .L2 + stwcx. 5, 0, 3 + bne- .L1 + .L2: + xor 3,0,4 + subfic 5,3,0 + adde 3,5,3 + blr + .size pmix_atomic_cmpset_32, .-pmix_atomic_cmpset_32 + + + .globl pmix_atomic_cmpset_acq_32 + .type pmix_atomic_cmpset_acq_32, @function +pmix_atomic_cmpset_acq_32: + .L3: lwarx 0, 0, 3 + cmpw 0, 0, 4 + bne- .L4 + stwcx. 5, 0, 3 + bne- .L3 + sync + .L4: + xor 3,0,4 + subfic 5,3,0 + adde 3,5,3 + lwsync + blr + .size pmix_atomic_cmpset_acq_32, .-pmix_atomic_cmpset_acq_32 + + + .globl pmix_atomic_cmpset_rel_32 + .type pmix_atomic_cmpset_rel_32, @function +pmix_atomic_cmpset_rel_32: + eieio + .L5: lwarx 0, 0, 3 + cmpw 0, 0, 4 + bne- .L6 + stwcx. 5, 0, 3 + bne- .L5 + sync + .L6: + xor 3,0,4 + subfic 5,3,0 + adde 3,5,3 + blr + .size pmix_atomic_cmpset_rel_32, .-pmix_atomic_cmpset_rel_32 + + + + .globl pmix_atomic_add_32 + .type pmix_atomic_add_32, @function +pmix_atomic_add_32: + .L13: lwarx 0, 0, 3 + add 0, 4, 0 + stwcx. 0, 0, 3 + bne- .L13 + mr 3,0 + blr + .size pmix_atomic_add_32, .-pmix_atomic_add_32 + + + .globl pmix_atomic_sub_32 + .type pmix_atomic_sub_32, @function +pmix_atomic_sub_32: + .L14: lwarx 0,0,3 + subf 0,4,0 + stwcx. 
0,0,3 + bne- .L14 + mr 3,0 + blr + .size pmix_atomic_sub_32, .-pmix_atomic_sub_32 + + .globl pmix_sys_timer_get_cycles + .type pmix_sys_timer_get_cycles, @function +pmix_sys_timer_get_cycles: + .L15: + mftbu 0 + mftb 11 + mftbu 2 + cmpw 7,2,0 + bne+ 7,.L15 + li 4,0 + li 9,0 + or 3,2,9 + or 4,4,11 + blr + .size pmix_sys_timer_get_cycles, .-pmix_sys_timer_get_cycles diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-linux.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-linux.s new file mode 100644 index 00000000000..afecd0a305a --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-linux.s @@ -0,0 +1,120 @@ + .text + + .align 2 + .globl pmix_atomic_mb + .type pmix_atomic_mb, @function +pmix_atomic_mb: + sync + blr + .size pmix_atomic_mb, .-pmix_atomic_mb + + + .globl pmix_atomic_rmb + .type pmix_atomic_rmb, @function +pmix_atomic_rmb: + lwsync + blr + .size pmix_atomic_rmb, .-pmix_atomic_rmb + + + .globl pmix_atomic_wmb + .type pmix_atomic_wmb, @function +pmix_atomic_wmb: + eieio + blr + .size pmix_atomic_wmb, .-pmix_atomic_wmb + + + .globl pmix_atomic_cmpset_32 + .type pmix_atomic_cmpset_32, @function +pmix_atomic_cmpset_32: + .L1: lwarx 0, 0, 3 + cmpw 0, 0, 4 + bne- .L2 + stwcx. 5, 0, 3 + bne- .L1 + .L2: + xor 3,0,4 + subfic 5,3,0 + adde 3,5,3 + blr + .size pmix_atomic_cmpset_32, .-pmix_atomic_cmpset_32 + + + .globl pmix_atomic_cmpset_acq_32 + .type pmix_atomic_cmpset_acq_32, @function +pmix_atomic_cmpset_acq_32: + .L3: lwarx 0, 0, 3 + cmpw 0, 0, 4 + bne- .L4 + stwcx. 5, 0, 3 + bne- .L3 + sync + .L4: + xor 3,0,4 + subfic 5,3,0 + adde 3,5,3 + lwsync + blr + .size pmix_atomic_cmpset_acq_32, .-pmix_atomic_cmpset_acq_32 + + + .globl pmix_atomic_cmpset_rel_32 + .type pmix_atomic_cmpset_rel_32, @function +pmix_atomic_cmpset_rel_32: + eieio + .L5: lwarx 0, 0, 3 + cmpw 0, 0, 4 + bne- .L6 + stwcx. 
5, 0, 3 + bne- .L5 + sync + .L6: + xor 3,0,4 + subfic 5,3,0 + adde 3,5,3 + blr + .size pmix_atomic_cmpset_rel_32, .-pmix_atomic_cmpset_rel_32 + + + + .globl pmix_atomic_add_32 + .type pmix_atomic_add_32, @function +pmix_atomic_add_32: + .L13: lwarx 0, 0, 3 + add 0, 4, 0 + stwcx. 0, 0, 3 + bne- .L13 + mr 3,0 + blr + .size pmix_atomic_add_32, .-pmix_atomic_add_32 + + + .globl pmix_atomic_sub_32 + .type pmix_atomic_sub_32, @function +pmix_atomic_sub_32: + .L14: lwarx 0,0,3 + subf 0,4,0 + stwcx. 0,0,3 + bne- .L14 + mr 3,0 + blr + .size pmix_atomic_sub_32, .-pmix_atomic_sub_32 + + .globl pmix_sys_timer_get_cycles + .type pmix_sys_timer_get_cycles, @function +pmix_sys_timer_get_cycles: + .L15: + mftbu 0 + mftb 11 + mftbu 2 + cmpw 7,2,0 + bne+ 7,.L15 + li 4,0 + li 9,0 + or 3,2,9 + or 4,4,11 + blr + .size pmix_sys_timer_get_cycles, .-pmix_sys_timer_get_cycles + + .section .note.GNU-stack,"",@progbits diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-osx.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-osx.s new file mode 100644 index 00000000000..7d2dceb2a81 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc32-osx.s @@ -0,0 +1,100 @@ + .text + + .align 2 + .globl _pmix_atomic_mb +_pmix_atomic_mb: + sync + blr + + + .globl _pmix_atomic_rmb +_pmix_atomic_rmb: + lwsync + blr + + + .globl _pmix_atomic_wmb +_pmix_atomic_wmb: + eieio + blr + + + .globl _pmix_atomic_cmpset_32 +_pmix_atomic_cmpset_32: + L1: lwarx r0, 0, r3 + cmpw 0, r0, r4 + bne- L2 + stwcx. r5, 0, r3 + bne- L1 + L2: + xor r3,r0,r4 + subfic r5,r3,0 + adde r3,r5,r3 + blr + + + .globl _pmix_atomic_cmpset_acq_32 +_pmix_atomic_cmpset_acq_32: + L3: lwarx r0, 0, r3 + cmpw 0, r0, r4 + bne- L4 + stwcx. r5, 0, r3 + bne- L3 + sync + L4: + xor r3,r0,r4 + subfic r5,r3,0 + adde r3,r5,r3 + lwsync + blr + + + .globl _pmix_atomic_cmpset_rel_32 +_pmix_atomic_cmpset_rel_32: + eieio + L5: lwarx r0, 0, r3 + cmpw 0, r0, r4 + bne- L6 + stwcx. 
r5, 0, r3 + bne- L5 + sync + L6: + xor r3,r0,r4 + subfic r5,r3,0 + adde r3,r5,r3 + blr + + + + .globl _pmix_atomic_add_32 +_pmix_atomic_add_32: + L13: lwarx r0, 0, r3 + add r0, r4, r0 + stwcx. r0, 0, r3 + bne- L13 + mr r3,r0 + blr + + + .globl _pmix_atomic_sub_32 +_pmix_atomic_sub_32: + L14: lwarx r0,0,r3 + subf r0,r4,r0 + stwcx. r0,0,r3 + bne- L14 + mr r3,r0 + blr + + .globl _pmix_sys_timer_get_cycles +_pmix_sys_timer_get_cycles: + L15: + mftbu r0 + mftb r11 + mftbu r2 + cmpw cr7,r2,r0 + bne+ cr7,L15 + li r4,0 + li r9,0 + or r3,r2,r9 + or r4,r4,r11 + blr diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-aix.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-aix.s new file mode 100644 index 00000000000..7e3995e3512 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-aix.s @@ -0,0 +1,230 @@ + .machine "ppc64" + .toc + .csect .text[PR] + + .align 2 + .globl pmix_atomic_mb + .globl .pmix_atomic_mb + .csect [DS],3 +pmix_atomic_mb: + .llong .pmix_atomic_mb, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_mb: + sync + blr + + + .globl pmix_atomic_rmb + .globl .pmix_atomic_rmb + .csect [DS],3 +pmix_atomic_rmb: + .llong .pmix_atomic_rmb, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_rmb: + lwsync + blr + + + .globl pmix_atomic_wmb + .globl .pmix_atomic_wmb + .csect [DS],3 +pmix_atomic_wmb: + .llong .pmix_atomic_wmb, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_wmb: + eieio + blr + + + .globl pmix_atomic_cmpset_32 + .globl .pmix_atomic_cmpset_32 + .csect [DS],3 +pmix_atomic_cmpset_32: + .llong .pmix_atomic_cmpset_32, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_cmpset_32: + L1: lwarx 0, 0, 3 + cmpw 0, 0, 4 + bne- L2 + stwcx. 
5, 0, 3 + bne- L1 + L2: + cmpw 7,0,4 + mfcr 3 + rlwinm 3,3,31,1 + blr + + + .globl pmix_atomic_cmpset_acq_32 + .globl .pmix_atomic_cmpset_acq_32 + .csect [DS],3 +pmix_atomic_cmpset_acq_32: + .llong .pmix_atomic_cmpset_acq_32, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_cmpset_acq_32: + mflr 0 + std 29,-24(1) + std 0,16(1) + stdu 1,-144(1) + bl .pmix_atomic_cmpset_32 + mr 29,3 + bl .pmix_atomic_rmb + mr 3,29 + addi 1,1,144 + ld 0,16(1) + mtlr 0 + ld 29,-24(1) + blr + + + .globl pmix_atomic_cmpset_rel_32 + .globl .pmix_atomic_cmpset_rel_32 + .csect [DS],3 +pmix_atomic_cmpset_rel_32: + .llong .pmix_atomic_cmpset_rel_32, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_cmpset_rel_32: + mflr 0 + std 27,-40(1) + std 28,-32(1) + std 29,-24(1) + std 0,16(1) + stdu 1,-160(1) + mr 29,3 + mr 28,4 + mr 27,5 + bl .pmix_atomic_wmb + mr 3,29 + mr 4,28 + mr 5,27 + bl .pmix_atomic_cmpset_32 + addi 1,1,160 + ld 0,16(1) + mtlr 0 + ld 27,-40(1) + ld 28,-32(1) + ld 29,-24(1) + blr + + + .globl pmix_atomic_cmpset_64 + .globl .pmix_atomic_cmpset_64 + .csect [DS],3 +pmix_atomic_cmpset_64: + .llong .pmix_atomic_cmpset_64, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_cmpset_64: + L3: ldarx 0, 0, 3 + cmpd 0, 0, 4 + bne- L4 + stdcx. 5, 0, 3 + bne- L3 + L4: + xor 3,4,0 + subfic 5,3,0 + adde 3,5,3 + blr + + + .globl pmix_atomic_cmpset_acq_64 + .globl .pmix_atomic_cmpset_acq_64 + .csect [DS],3 +pmix_atomic_cmpset_acq_64: + .llong .pmix_atomic_cmpset_acq_64, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_cmpset_acq_64: + L7: ldarx 0, 0, 3 + cmpd 0, 0, 4 + bne- L8 + stdcx. 5, 0, 3 + bne- L7 + L8: + lwsync + xor 3,4,0 + subfic 5,3,0 + adde 3,5,3 + blr + + + .globl pmix_atomic_cmpset_rel_64 + .globl .pmix_atomic_cmpset_rel_64 + .csect [DS],3 +pmix_atomic_cmpset_rel_64: + .llong .pmix_atomic_cmpset_rel_64, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_cmpset_rel_64: + eieio + L9: ldarx 0, 0, 3 + cmpd 0, 0, 4 + bne- L10 + stdcx. 
5, 0, 3 + bne- L9 + L10: + xor 3,4,0 + subfic 5,3,0 + adde 3,5,3 + blr + + + .globl pmix_atomic_add_32 + .globl .pmix_atomic_add_32 + .csect [DS],3 +pmix_atomic_add_32: + .llong .pmix_atomic_add_32, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_add_32: + L5: lwarx 0, 0, 3 + add 0, 4, 0 + stwcx. 0, 0, 3 + bne- L5 + + mr 3,0 + blr + + + .globl pmix_atomic_sub_32 + .globl .pmix_atomic_sub_32 + .csect [DS],3 +pmix_atomic_sub_32: + .llong .pmix_atomic_sub_32, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_atomic_sub_32: + L6: lwarx 0,0,3 + subf 0,4,0 + stwcx. 0,0,3 + bne- L6 + + mr 3,0 + blr + + .globl pmix_sys_timer_get_cycles + .globl .pmix_sys_timer_get_cycles + .csect [DS],3 +pmix_sys_timer_get_cycles: + .llong .pmix_sys_timer_get_cycles, TOC[tc0], 0 + .csect [PR] + .align 2 +.pmix_sys_timer_get_cycles: + L11: + mftbu 2 + rldicl 2,2,0,32 + mftb 0 + rldicl 9,0,0,32 + mftbu 0 + rldicl 0,0,0,32 + cmpw 7,0,2 + bne 7,L11 + sldi 3,0,32 + or 3,3,9 + blr diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-linux-nongas.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-linux-nongas.s new file mode 100644 index 00000000000..1bb4731ae32 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-linux-nongas.s @@ -0,0 +1,180 @@ + .text + + .align 2 + .globl .pmix_atomic_mb + .type .pmix_atomic_mb, @function +.pmix_atomic_mb: + sync + blr + .size .pmix_atomic_mb, .-.pmix_atomic_mb + + + .globl .pmix_atomic_rmb + .type .pmix_atomic_rmb, @function +.pmix_atomic_rmb: + lwsync + blr + .size .pmix_atomic_rmb, .-.pmix_atomic_rmb + + + .globl .pmix_atomic_wmb + .type .pmix_atomic_wmb, @function +.pmix_atomic_wmb: + eieio + blr + .size .pmix_atomic_wmb, .-.pmix_atomic_wmb + + + .globl .pmix_atomic_cmpset_32 + .type .pmix_atomic_cmpset_32, @function +.pmix_atomic_cmpset_32: + .L1: lwarx 0, 0, 3 + cmpw 0, 0, 4 + bne- .L2 + stwcx. 
5, 0, 3 + bne- .L1 + .L2: + cmpw 7,0,4 + mfcr 3 + rlwinm 3,3,31,1 + blr + .size .pmix_atomic_cmpset_32, .-.pmix_atomic_cmpset_32 + + + .globl .pmix_atomic_cmpset_acq_32 + .type .pmix_atomic_cmpset_acq_32, @function +.pmix_atomic_cmpset_acq_32: + mflr 0 + std 29,-24(1) + std 0,16(1) + stdu 1,-144(1) + bl .pmix_atomic_cmpset_32 + mr 29,3 + bl .pmix_atomic_rmb + mr 3,29 + addi 1,1,144 + ld 0,16(1) + mtlr 0 + ld 29,-24(1) + blr + .size .pmix_atomic_cmpset_acq_32, .-.pmix_atomic_cmpset_acq_32 + + + .globl .pmix_atomic_cmpset_rel_32 + .type .pmix_atomic_cmpset_rel_32, @function +.pmix_atomic_cmpset_rel_32: + mflr 0 + std 27,-40(1) + std 28,-32(1) + std 29,-24(1) + std 0,16(1) + stdu 1,-160(1) + mr 29,3 + mr 28,4 + mr 27,5 + bl .pmix_atomic_wmb + mr 3,29 + mr 4,28 + mr 5,27 + bl .pmix_atomic_cmpset_32 + addi 1,1,160 + ld 0,16(1) + mtlr 0 + ld 27,-40(1) + ld 28,-32(1) + ld 29,-24(1) + blr + .size .pmix_atomic_cmpset_rel_32, .-.pmix_atomic_cmpset_rel_32 + + + .globl .pmix_atomic_cmpset_64 + .type .pmix_atomic_cmpset_64, @function +.pmix_atomic_cmpset_64: + .L3: ldarx 0, 0, 3 + cmpd 0, 0, 4 + bne- .L4 + stdcx. 5, 0, 3 + bne- .L3 + .L4: + xor 3,4,0 + subfic 5,3,0 + adde 3,5,3 + blr + .size .pmix_atomic_cmpset_64, .-.pmix_atomic_cmpset_64 + + + .globl .pmix_atomic_cmpset_acq_64 + .type .pmix_atomic_cmpset_acq_64, @function +.pmix_atomic_cmpset_acq_64: + .L7: ldarx 0, 0, 3 + cmpd 0, 0, 4 + bne- .L8 + stdcx. 5, 0, 3 + bne- .L7 + .L8: + lwsync + xor 3,4,0 + subfic 5,3,0 + adde 3,5,3 + blr + .size .pmix_atomic_cmpset_acq_64, .-.pmix_atomic_cmpset_acq_64 + + + .globl .pmix_atomic_cmpset_rel_64 + .type .pmix_atomic_cmpset_rel_64, @function +.pmix_atomic_cmpset_rel_64: + eieio + .L9: ldarx 0, 0, 3 + cmpd 0, 0, 4 + bne- .L10 + stdcx. 
5, 0, 3 + bne- .L9 + .L10: + xor 3,4,0 + subfic 5,3,0 + adde 3,5,3 + blr + .size .pmix_atomic_cmpset_rel_64, .-.pmix_atomic_cmpset_rel_64 + + + .globl .pmix_atomic_add_32 + .type .pmix_atomic_add_32, @function +.pmix_atomic_add_32: + .L5: lwarx 0, 0, 3 + add 0, 4, 0 + stwcx. 0, 0, 3 + bne- .L5 + + mr 3,0 + blr + .size .pmix_atomic_add_32, .-.pmix_atomic_add_32 + + + .globl .pmix_atomic_sub_32 + .type .pmix_atomic_sub_32, @function +.pmix_atomic_sub_32: + .L6: lwarx 0,0,3 + subf 0,4,0 + stwcx. 0,0,3 + bne- .L6 + + mr 3,0 + blr + .size .pmix_atomic_sub_32, .-.pmix_atomic_sub_32 + + .globl .pmix_sys_timer_get_cycles + .type .pmix_sys_timer_get_cycles, @function +.pmix_sys_timer_get_cycles: + .L11: + mftbu 2 + rldicl 2,2,0,32 + mftb 0 + rldicl 9,0,0,32 + mftbu 0 + rldicl 0,0,0,32 + cmpw 7,0,2 + bne 7,.L11 + sldi 3,0,32 + or 3,3,9 + blr + .size .pmix_sys_timer_get_cycles, .-.pmix_sys_timer_get_cycles diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-linux.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-linux.s new file mode 100644 index 00000000000..300d0aa0d70 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-linux.s @@ -0,0 +1,182 @@ + .text + + .align 2 + .globl .pmix_atomic_mb + .type .pmix_atomic_mb, @function +.pmix_atomic_mb: + sync + blr + .size .pmix_atomic_mb, .-.pmix_atomic_mb + + + .globl .pmix_atomic_rmb + .type .pmix_atomic_rmb, @function +.pmix_atomic_rmb: + lwsync + blr + .size .pmix_atomic_rmb, .-.pmix_atomic_rmb + + + .globl .pmix_atomic_wmb + .type .pmix_atomic_wmb, @function +.pmix_atomic_wmb: + eieio + blr + .size .pmix_atomic_wmb, .-.pmix_atomic_wmb + + + .globl .pmix_atomic_cmpset_32 + .type .pmix_atomic_cmpset_32, @function +.pmix_atomic_cmpset_32: + .L1: lwarx 0, 0, 3 + cmpw 0, 0, 4 + bne- .L2 + stwcx. 
5, 0, 3 + bne- .L1 + .L2: + cmpw 7,0,4 + mfcr 3 + rlwinm 3,3,31,1 + blr + .size .pmix_atomic_cmpset_32, .-.pmix_atomic_cmpset_32 + + + .globl .pmix_atomic_cmpset_acq_32 + .type .pmix_atomic_cmpset_acq_32, @function +.pmix_atomic_cmpset_acq_32: + mflr 0 + std 29,-24(1) + std 0,16(1) + stdu 1,-144(1) + bl .pmix_atomic_cmpset_32 + mr 29,3 + bl .pmix_atomic_rmb + mr 3,29 + addi 1,1,144 + ld 0,16(1) + mtlr 0 + ld 29,-24(1) + blr + .size .pmix_atomic_cmpset_acq_32, .-.pmix_atomic_cmpset_acq_32 + + + .globl .pmix_atomic_cmpset_rel_32 + .type .pmix_atomic_cmpset_rel_32, @function +.pmix_atomic_cmpset_rel_32: + mflr 0 + std 27,-40(1) + std 28,-32(1) + std 29,-24(1) + std 0,16(1) + stdu 1,-160(1) + mr 29,3 + mr 28,4 + mr 27,5 + bl .pmix_atomic_wmb + mr 3,29 + mr 4,28 + mr 5,27 + bl .pmix_atomic_cmpset_32 + addi 1,1,160 + ld 0,16(1) + mtlr 0 + ld 27,-40(1) + ld 28,-32(1) + ld 29,-24(1) + blr + .size .pmix_atomic_cmpset_rel_32, .-.pmix_atomic_cmpset_rel_32 + + + .globl .pmix_atomic_cmpset_64 + .type .pmix_atomic_cmpset_64, @function +.pmix_atomic_cmpset_64: + .L3: ldarx 0, 0, 3 + cmpd 0, 0, 4 + bne- .L4 + stdcx. 5, 0, 3 + bne- .L3 + .L4: + xor 3,4,0 + subfic 5,3,0 + adde 3,5,3 + blr + .size .pmix_atomic_cmpset_64, .-.pmix_atomic_cmpset_64 + + + .globl .pmix_atomic_cmpset_acq_64 + .type .pmix_atomic_cmpset_acq_64, @function +.pmix_atomic_cmpset_acq_64: + .L7: ldarx 0, 0, 3 + cmpd 0, 0, 4 + bne- .L8 + stdcx. 5, 0, 3 + bne- .L7 + .L8: + lwsync + xor 3,4,0 + subfic 5,3,0 + adde 3,5,3 + blr + .size .pmix_atomic_cmpset_acq_64, .-.pmix_atomic_cmpset_acq_64 + + + .globl .pmix_atomic_cmpset_rel_64 + .type .pmix_atomic_cmpset_rel_64, @function +.pmix_atomic_cmpset_rel_64: + eieio + .L9: ldarx 0, 0, 3 + cmpd 0, 0, 4 + bne- .L10 + stdcx. 
5, 0, 3 + bne- .L9 + .L10: + xor 3,4,0 + subfic 5,3,0 + adde 3,5,3 + blr + .size .pmix_atomic_cmpset_rel_64, .-.pmix_atomic_cmpset_rel_64 + + + .globl .pmix_atomic_add_32 + .type .pmix_atomic_add_32, @function +.pmix_atomic_add_32: + .L5: lwarx 0, 0, 3 + add 0, 4, 0 + stwcx. 0, 0, 3 + bne- .L5 + + mr 3,0 + blr + .size .pmix_atomic_add_32, .-.pmix_atomic_add_32 + + + .globl .pmix_atomic_sub_32 + .type .pmix_atomic_sub_32, @function +.pmix_atomic_sub_32: + .L6: lwarx 0,0,3 + subf 0,4,0 + stwcx. 0,0,3 + bne- .L6 + + mr 3,0 + blr + .size .pmix_atomic_sub_32, .-.pmix_atomic_sub_32 + + .globl .pmix_sys_timer_get_cycles + .type .pmix_sys_timer_get_cycles, @function +.pmix_sys_timer_get_cycles: + .L11: + mftbu 2 + rldicl 2,2,0,32 + mftb 0 + rldicl 9,0,0,32 + mftbu 0 + rldicl 0,0,0,32 + cmpw 7,0,2 + bne 7,.L11 + sldi 3,0,32 + or 3,3,9 + blr + .size .pmix_sys_timer_get_cycles, .-.pmix_sys_timer_get_cycles + + .section .note.GNU-stack,"",@progbits diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-osx.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-osx.s new file mode 100644 index 00000000000..3a29e67e018 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-powerpc64-osx.s @@ -0,0 +1,156 @@ + .text + + .align 2 + .globl _pmix_atomic_mb +_pmix_atomic_mb: + sync + blr + + + .globl _pmix_atomic_rmb +_pmix_atomic_rmb: + lwsync + blr + + + .globl _pmix_atomic_wmb +_pmix_atomic_wmb: + eieio + blr + + + .globl _pmix_atomic_cmpset_32 +_pmix_atomic_cmpset_32: + L1: lwarx r0, 0, r3 + cmpw 0, r0, r4 + bne- L2 + stwcx. 
r5, 0, r3 + bne- L1 + L2: + cmpw cr7,r0,r4 + mfcr r3 + rlwinm r3,r3,31,1 + blr + + + .globl _pmix_atomic_cmpset_acq_32 +_pmix_atomic_cmpset_acq_32: + mflr r0 + std r29,-24(r1) + std r0,16(r1) + stdu r1,-144(r1) + bl _pmix_atomic_cmpset_32 + mr r29,r3 + bl _pmix_atomic_rmb + mr r3,r29 + addi r1,r1,144 + ld r0,16(r1) + mtlr r0 + ld r29,-24(r1) + blr + + + .globl _pmix_atomic_cmpset_rel_32 +_pmix_atomic_cmpset_rel_32: + mflr r0 + std r27,-40(r1) + std r28,-32(r1) + std r29,-24(r1) + std r0,16(r1) + stdu r1,-160(r1) + mr r29,r3 + mr r28,r4 + mr r27,r5 + bl _pmix_atomic_wmb + mr r3,r29 + mr r4,r28 + mr r5,r27 + bl _pmix_atomic_cmpset_32 + addi r1,r1,160 + ld r0,16(r1) + mtlr r0 + ld r27,-40(r1) + ld r28,-32(r1) + ld r29,-24(r1) + blr + + + .globl _pmix_atomic_cmpset_64 +_pmix_atomic_cmpset_64: + L3: ldarx r0, 0, r3 + cmpd 0, r0, r4 + bne- L4 + stdcx. r5, 0, r3 + bne- L3 + L4: + xor r3,r4,r0 + subfic r5,r3,0 + adde r3,r5,r3 + blr + + + .globl _pmix_atomic_cmpset_acq_64 +_pmix_atomic_cmpset_acq_64: + L7: ldarx r0, 0, r3 + cmpd 0, r0, r4 + bne- L8 + stdcx. r5, 0, r3 + bne- L7 + L8: + lwsync + xor r3,r4,r0 + subfic r5,r3,0 + adde r3,r5,r3 + blr + + + .globl _pmix_atomic_cmpset_rel_64 +_pmix_atomic_cmpset_rel_64: + eieio + L9: ldarx r0, 0, r3 + cmpd 0, r0, r4 + bne- L10 + stdcx. r5, 0, r3 + bne- L9 + L10: + xor r3,r4,r0 + subfic r5,r3,0 + adde r3,r5,r3 + blr + + + .globl _pmix_atomic_add_32 +_pmix_atomic_add_32: + L5: lwarx r0, 0, r3 + add r0, r4, r0 + stwcx. r0, 0, r3 + bne- L5 + + mr r3,r0 + blr + + + .globl _pmix_atomic_sub_32 +_pmix_atomic_sub_32: + L6: lwarx r0,0,r3 + subf r0,r4,r0 + stwcx. 
r0,0,r3 + bne- L6 + + mr r3,r0 + blr + + .globl _pmix_sys_timer_get_cycles +_pmix_sys_timer_get_cycles: + L11: + mftbu r2 + rldicl r2,r2,0,32 + mftb r0 + rldicl r9,r0,0,32 + mftbu r0 + rldicl r0,r0,0,32 + cmpw cr7,r0,r2 + bne cr7,L11 + sldi r3,r0,32 + or r3,r3,r9 + blr diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-sparcv9-32-solaris.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-sparcv9-32-solaris.s new file mode 100644 index 00000000000..3fb48494f6e --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-sparcv9-32-solaris.s @@ -0,0 +1,190 @@ + .text + + .align 4 + + + .globl pmix_atomic_mb + .type pmix_atomic_mb, #function +pmix_atomic_mb: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad + retl + nop + .size pmix_atomic_mb, .-pmix_atomic_mb + + + .globl pmix_atomic_rmb + .type pmix_atomic_rmb, #function +pmix_atomic_rmb: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #LoadLoad + retl + nop + .size pmix_atomic_rmb, .-pmix_atomic_rmb + + + .globl pmix_atomic_wmb + .type pmix_atomic_wmb, #function +pmix_atomic_wmb: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #StoreStore + retl + nop + .size pmix_atomic_wmb, .-pmix_atomic_wmb + + + .globl pmix_atomic_cmpset_32 + .type pmix_atomic_cmpset_32, #function +pmix_atomic_cmpset_32: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + casa [%o0] 0x80, %o1, %o2 + xor %o2, %o1, %o2 + subcc %g0, %o2, %g0 + retl + subx %g0, -1, %o0 + .size pmix_atomic_cmpset_32, .-pmix_atomic_cmpset_32 + + + .globl pmix_atomic_cmpset_acq_32 + .type pmix_atomic_cmpset_acq_32, #function +pmix_atomic_cmpset_acq_32: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + casa [%o0] 0x80, %o1, %o2 + xor %o2, %o1, %o2 + subcc %g0, %o2, %g0 + subx %g0, -1, %o0 + membar #LoadLoad + retl + sra %o0, 0, %o0 + .size pmix_atomic_cmpset_acq_32, .-pmix_atomic_cmpset_acq_32 + + + .globl pmix_atomic_cmpset_rel_32 + .type pmix_atomic_cmpset_rel_32, #function +pmix_atomic_cmpset_rel_32: + !#PROLOGUE# 0 + 
!#PROLOGUE# 1 + membar #StoreStore + casa [%o0] 0x80, %o1, %o2 + xor %o2, %o1, %o2 + subcc %g0, %o2, %g0 + retl + subx %g0, -1, %o0 + .size pmix_atomic_cmpset_rel_32, .-pmix_atomic_cmpset_rel_32 + + + .globl pmix_atomic_cmpset_64 + .type pmix_atomic_cmpset_64, #function +pmix_atomic_cmpset_64: + !#PROLOGUE# 0 + save %sp, -128, %sp + !#PROLOGUE# 1 + mov %i3, %o4 + mov %i4, %o5 + st %i1, [%fp-32] + st %i2, [%fp-28] + std %o4, [%fp-24] + ldx [%fp-24], %g1 + ldx [%fp-32], %g2 + casxa [%i0] 0x80, %g2, %g1 + stx %g1, [%fp-24] + + ld [%fp-24], %i5 + ld [%fp-32], %g1 + cmp %i5, %g1 + bne .L12 + mov 0, %i0 + ld [%fp-20], %i2 + ld [%fp-28], %i1 + cmp %i2, %i1 + be,a .L12 + mov 1, %i0 +.L12: + ret + restore + .size pmix_atomic_cmpset_64, .-pmix_atomic_cmpset_64 + + + .globl pmix_atomic_cmpset_acq_64 + .type pmix_atomic_cmpset_acq_64, #function +pmix_atomic_cmpset_acq_64: + !#PROLOGUE# 0 + save %sp, -128, %sp + !#PROLOGUE# 1 + mov %i1, %o4 + mov %i2, %o5 + mov %i3, %o2 + mov %i4, %o3 + std %o4, [%fp-32] + std %o2, [%fp-24] + ldx [%fp-24], %g1 + ldx [%fp-32], %g2 + casxa [%i0] 0x80, %g2, %g1 + stx %g1, [%fp-24] + + ld [%fp-24], %i5 + ld [%fp-32], %g1 + cmp %i5, %g1 + bne .L16 + mov 0, %i0 + ld [%fp-20], %i2 + ld [%fp-28], %i1 + cmp %i2, %i1 + be,a .L16 + mov 1, %i0 +.L16: + membar #LoadLoad + ret + restore + .size pmix_atomic_cmpset_acq_64, .-pmix_atomic_cmpset_acq_64 + + + .globl pmix_atomic_cmpset_rel_64 + .type pmix_atomic_cmpset_rel_64, #function +pmix_atomic_cmpset_rel_64: + !#PROLOGUE# 0 + save %sp, -128, %sp + !#PROLOGUE# 1 + mov %i1, %o4 + mov %i2, %o5 + mov %i3, %o2 + mov %i4, %o3 + membar #StoreStore + std %o4, [%fp-32] + std %o2, [%fp-24] + ldx [%fp-24], %g1 + ldx [%fp-32], %g2 + casxa [%i0] 0x80, %g2, %g1 + stx %g1, [%fp-24] + + ld [%fp-24], %i5 + ld [%fp-32], %g1 + cmp %i5, %g1 + bne .L21 + mov 0, %i0 + ld [%fp-20], %i2 + ld [%fp-28], %i1 + cmp %i2, %i1 + be,a .L21 + mov 1, %i0 +.L21: + ret + restore + .size pmix_atomic_cmpset_rel_64, .-pmix_atomic_cmpset_rel_64 + 
+ + .globl pmix_sys_timer_get_cycles + .type pmix_sys_timer_get_cycles, #function +pmix_sys_timer_get_cycles: + save %sp,-96,%sp + rd %tick,%o0 + srlx %o0,32,%o1 + or %g0,%o1,%i0 + ret ! Result = %i0 + restore %o0,0,%o1 + .size pmix_sys_timer_get_cycles, .-pmix_sys_timer_get_cycles diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-sparcv9-64-solaris.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-sparcv9-64-solaris.s new file mode 100644 index 00000000000..7aae1cb8ed3 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-sparcv9-64-solaris.s @@ -0,0 +1,130 @@ + .text + + .align 4 + + + .globl pmix_atomic_mb + .type pmix_atomic_mb, #function +pmix_atomic_mb: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad + retl + nop + .size pmix_atomic_mb, .-pmix_atomic_mb + + + .globl pmix_atomic_rmb + .type pmix_atomic_rmb, #function +pmix_atomic_rmb: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #LoadLoad + retl + nop + .size pmix_atomic_rmb, .-pmix_atomic_rmb + + + .globl pmix_atomic_wmb + .type pmix_atomic_wmb, #function +pmix_atomic_wmb: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #StoreStore + retl + nop + .size pmix_atomic_wmb, .-pmix_atomic_wmb + + + .globl pmix_atomic_cmpset_32 + .type pmix_atomic_cmpset_32, #function +pmix_atomic_cmpset_32: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + casa [%o0] 0x80, %o1, %o2 + xor %o2, %o1, %o2 + subcc %g0, %o2, %g0 + retl + subx %g0, -1, %o0 + .size pmix_atomic_cmpset_32, .-pmix_atomic_cmpset_32 + + + .globl pmix_atomic_cmpset_acq_32 + .type pmix_atomic_cmpset_acq_32, #function +pmix_atomic_cmpset_acq_32: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + casa [%o0] 0x80, %o1, %o2 + xor %o2, %o1, %o2 + subcc %g0, %o2, %g0 + subx %g0, -1, %o0 + membar #LoadLoad + retl + sra %o0, 0, %o0 + .size pmix_atomic_cmpset_acq_32, .-pmix_atomic_cmpset_acq_32 + + + .globl pmix_atomic_cmpset_rel_32 + .type pmix_atomic_cmpset_rel_32, #function +pmix_atomic_cmpset_rel_32: + 
!#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #StoreStore + casa [%o0] 0x80, %o1, %o2 + xor %o2, %o1, %o2 + subcc %g0, %o2, %g0 + retl + subx %g0, -1, %o0 + .size pmix_atomic_cmpset_rel_32, .-pmix_atomic_cmpset_rel_32 + + + .globl pmix_atomic_cmpset_64 + .type pmix_atomic_cmpset_64, #function +pmix_atomic_cmpset_64: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + casxa [%o0] 0x80, %o1, %o2 + mov 0, %o0 + xor %o2, %o1, %o2 + retl + movre %o2, 1, %o0 + .size pmix_atomic_cmpset_64, .-pmix_atomic_cmpset_64 + + + .globl pmix_atomic_cmpset_acq_64 + .type pmix_atomic_cmpset_acq_64, #function +pmix_atomic_cmpset_acq_64: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + casxa [%o0] 0x80, %o1, %o2 + mov 0, %o0 + xor %o2, %o1, %o2 + movre %o2, 1, %o0 + membar #LoadLoad + retl + sra %o0, 0, %o0 + .size pmix_atomic_cmpset_acq_64, .-pmix_atomic_cmpset_acq_64 + + + .globl pmix_atomic_cmpset_rel_64 + .type pmix_atomic_cmpset_rel_64, #function +pmix_atomic_cmpset_rel_64: + !#PROLOGUE# 0 + !#PROLOGUE# 1 + membar #StoreStore + casxa [%o0] 0x80, %o1, %o2 + mov 0, %o0 + xor %o2, %o1, %o2 + retl + movre %o2, 1, %o0 + .size pmix_atomic_cmpset_rel_64, .-pmix_atomic_cmpset_rel_64 + + + .globl pmix_sys_timer_get_cycles + .type pmix_sys_timer_get_cycles, #function +pmix_sys_timer_get_cycles: + save %sp,-176,%sp + rd %tick,%o0 + ret ! 
Result = %i0 + restore %o0,0,%o0 + .size pmix_sys_timer_get_cycles, .-pmix_sys_timer_get_cycles diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-x86_64-linux-nongas.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-x86_64-linux-nongas.s new file mode 100644 index 00000000000..85d19d9b1e2 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-x86_64-linux-nongas.s @@ -0,0 +1,63 @@ + .text + + .globl pmix_atomic_mb + .type pmix_atomic_mb, @function +pmix_atomic_mb: + pushq %rbp + movq %rsp, %rbp + leave + ret + .size pmix_atomic_mb, .-pmix_atomic_mb + + + .globl pmix_atomic_rmb + .type pmix_atomic_rmb, @function +pmix_atomic_rmb: + pushq %rbp + movq %rsp, %rbp + leave + ret + .size pmix_atomic_rmb, .-pmix_atomic_rmb + + + .globl pmix_atomic_wmb + .type pmix_atomic_wmb, @function +pmix_atomic_wmb: + pushq %rbp + movq %rsp, %rbp + leave + ret + .size pmix_atomic_wmb, .-pmix_atomic_wmb + + + .globl pmix_atomic_cmpset_32 + .type pmix_atomic_cmpset_32, @function +pmix_atomic_cmpset_32: + movl %esi, %eax + lock; cmpxchgl %edx,(%rdi) + sete %dl + movzbl %dl, %eax + ret + .size pmix_atomic_cmpset_32, .-pmix_atomic_cmpset_32 + + + .globl pmix_atomic_cmpset_64 + .type pmix_atomic_cmpset_64, @function +pmix_atomic_cmpset_64: + movq %rsi, %rax + lock; cmpxchgq %rdx,(%rdi) + sete %dl + movzbl %dl, %eax + ret + .size pmix_atomic_cmpset_64, .-pmix_atomic_cmpset_64 + + + .globl pmix_sys_timer_get_cycles + .type pmix_sys_timer_get_cycles, @function +pmix_sys_timer_get_cycles: + rdtsc + salq $32, %rdx + mov %eax, %eax + orq %rdx, %rax + ret + .size pmix_sys_timer_get_cycles, .-pmix_sys_timer_get_cycles diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-x86_64-linux.s b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-x86_64-linux.s new file mode 100644 index 00000000000..f60867c1ab8 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/asm/generated/atomic-x86_64-linux.s @@ -0,0 +1,65 @@ + 
.text + + .globl pmix_atomic_mb + .type pmix_atomic_mb, @function +pmix_atomic_mb: + pushq %rbp + movq %rsp, %rbp + leave + ret + .size pmix_atomic_mb, .-pmix_atomic_mb + + + .globl pmix_atomic_rmb + .type pmix_atomic_rmb, @function +pmix_atomic_rmb: + pushq %rbp + movq %rsp, %rbp + leave + ret + .size pmix_atomic_rmb, .-pmix_atomic_rmb + + + .globl pmix_atomic_wmb + .type pmix_atomic_wmb, @function +pmix_atomic_wmb: + pushq %rbp + movq %rsp, %rbp + leave + ret + .size pmix_atomic_wmb, .-pmix_atomic_wmb + + + .globl pmix_atomic_cmpset_32 + .type pmix_atomic_cmpset_32, @function +pmix_atomic_cmpset_32: + movl %esi, %eax + lock; cmpxchgl %edx,(%rdi) + sete %dl + movzbl %dl, %eax + ret + .size pmix_atomic_cmpset_32, .-pmix_atomic_cmpset_32 + + + .globl pmix_atomic_cmpset_64 + .type pmix_atomic_cmpset_64, @function +pmix_atomic_cmpset_64: + movq %rsi, %rax + lock; cmpxchgq %rdx,(%rdi) + sete %dl + movzbl %dl, %eax + ret + .size pmix_atomic_cmpset_64, .-pmix_atomic_cmpset_64 + + + .globl pmix_sys_timer_get_cycles + .type pmix_sys_timer_get_cycles, @function +pmix_sys_timer_get_cycles: + rdtsc + salq $32, %rdx + mov %eax, %eax + orq %rdx, %rax + ret + .size pmix_sys_timer_get_cycles, .-pmix_sys_timer_get_cycles + + .section .note.GNU-stack,"",@progbits diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/Makefile.include new file mode 100644 index 00000000000..9f677e5e44c --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/Makefile.include @@ -0,0 +1,44 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. 
+# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2011 Sandia National Laboratories. All rights reserved. +# Copyright (c) 2016 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2017 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This makefile.include does not stand on its own - it is included from src/Makefile.am + +headers += \ + atomics/sys/architecture.h \ + atomics/sys/atomic.h \ + atomics/sys/atomic_impl.h \ + atomics/sys/timer.h \ + atomics/sys/cma.h + +include atomics/sys/x86_64/Makefile.include +include atomics/sys/arm/Makefile.include +include atomics/sys/arm64/Makefile.include +include atomics/sys/ia32/Makefile.include +include atomics/sys/ia64/Makefile.include +include atomics/sys/mips/Makefile.include +include atomics/sys/powerpc/Makefile.include +include atomics/sys/sparcv9/Makefile.include +include atomics/sys/sync_builtin/Makefile.include +include atomics/sys/gcc_builtin/Makefile.include diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/architecture.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/architecture.h new file mode 100644 index 00000000000..244c966a164 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/architecture.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* + * List of supported architectures + */ + +#ifndef PMIX_SYS_ARCHITECTURE_H +#define PMIX_SYS_ARCHITECTURE_H + +/* Architectures */ +#define PMIX_UNSUPPORTED 0000 +#define PMIX_IA32 0010 +#define PMIX_IA64 0020 +#define PMIX_X86_64 0030 +#define PMIX_POWERPC32 0050 +#define PMIX_POWERPC64 0051 +#define PMIX_SPARC 0060 +#define PMIX_SPARCV9_32 0061 +#define PMIX_SPARCV9_64 0062 +#define PMIX_MIPS 0070 +#define PMIX_ARM 0100 +#define PMIX_ARM64 0101 +#define PMIX_S390 0110 +#define PMIX_S390X 0111 +#define PMIX_BUILTIN_SYNC 0200 +#define PMIX_BUILTIN_GCC 0202 +#define PMIX_BUILTIN_NO 0203 + +/* Formats */ +#define PMIX_DEFAULT 1000 /* standard for given architecture */ +#define PMIX_DARWIN 1001 /* Darwin / OS X on PowerPC */ +#define PMIX_PPC_LINUX 1002 /* Linux on PowerPC */ +#define PMIX_AIX 1003 /* AIX on Power / PowerPC */ + +#endif /* #ifndef PMIX_SYS_ARCHITECTURE_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm/Makefile.include new file mode 100644 index 00000000000..e25774e7fcb --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm/Makefile.include @@ -0,0 +1,24 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2008 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. 
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This Makefile.include does not stand on its own - it is included from src/atomics/sys/Makefile.include + +headers += \ + atomics/sys/arm/atomic.h \ + atomics/sys/arm/timer.h diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm/atomic.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm/atomic.h new file mode 100644 index 00000000000..1ee246252a9 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm/atomic.h @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2010 ARM ltd. All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* + * ARMv5 and earlier lack robust atomic operations and therefore this file uses + * Linux kernel support where needed. The kernel also provides memory barriers + * and this file uses them for ARMv5 and earlier processors, which lack the + * memory barrier instruction. These kernel functions are available on kernel + * versions 2.6.15 and greater; using them will result in undefined behavior on + * older kernels.
+ * See Documentation/arm/kernel_user_helpers.txt in the kernel tree for details + */ + +#ifndef PMIX_SYS_ARCH_ATOMIC_H +#define PMIX_SYS_ARCH_ATOMIC_H 1 + +#if (PMIX_ASM_ARM_VERSION >= 7) + +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 1 +/* use the DMB instruction if available... */ + +#define PMIXMB() __asm__ __volatile__ ("dmb" : : : "memory") +#define PMIXRMB() __asm__ __volatile__ ("dmb" : : : "memory") +#define PMIXWMB() __asm__ __volatile__ ("dmb" : : : "memory") + +#elif (PMIX_ASM_ARM_VERSION == 6) + +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 1 +/* ...or the v6-specific equivalent... */ + +#define PMIXMB() __asm__ __volatile__ ("mcr p15, 0, r0, c7, c10, 5" : : : "memory") +#define PMIXRMB() PMIXMB() /* read barrier reuses the full barrier */ +#define PMIXWMB() PMIXMB() /* write barrier reuses the full barrier */ + +#else + +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 1 +/* ...otherwise use the Linux kernel-provided barrier */ + +#define PMIXMB() (*((void (*)(void))(0xffff0fa0)))() +#define PMIXRMB() PMIXMB() /* read barrier reuses the full barrier */ +#define PMIXWMB() PMIXMB() /* write barrier reuses the full barrier */ + +#endif + +/********************************************************************** + * + * Memory Barriers + * + *********************************************************************/ + +#if (PMIX_HAVE_ATOMIC_MEM_BARRIER == 1) + +static inline +void pmix_atomic_mb(void) +{ + PMIXMB(); +} + + +static inline +void pmix_atomic_rmb(void) +{ + PMIXRMB(); +} + + +static inline +void pmix_atomic_wmb(void) +{ + PMIXWMB(); +} + +static inline +void pmix_atomic_isync(void) +{ +} + +#endif + + +/********************************************************************** + * + * Atomic math operations + * + *********************************************************************/ + +#if (PMIX_GCC_INLINE_ASSEMBLY && (PMIX_ASM_ARM_VERSION >= 6)) + +#define PMIX_HAVE_ATOMIC_CMPSET_32 1 +#define PMIX_HAVE_ATOMIC_MATH_32 1 +static inline int pmix_atomic_cmpset_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + int32_t ret, tmp; + + __asm__ __volatile__ ( + "1: ldrex %0, [%2] \n" + " cmp %0, %3 \n" + " bne 2f \n" + " strex %1, %4, [%2] \n" + " cmp %1, #0
\n" + " bne 1b \n" + "2: \n" + + : "=&r" (ret), "=&r" (tmp) + : "r" (addr), "r" (oldval), "r" (newval) + : "cc", "memory"); + + return (ret == oldval); +} + +/* these two functions aren't inlined in the non-gcc case because then + there would be two function calls (since neither cmpset_32 nor + atomic_?mb can be inlined). Instead, we "inline" them by hand in + the assembly, meaning there is one function call overhead instead + of two */ +static inline int pmix_atomic_cmpset_acq_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + int rc; + + rc = pmix_atomic_cmpset_32(addr, oldval, newval); + pmix_atomic_rmb(); + + return rc; +} + + +static inline int pmix_atomic_cmpset_rel_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + pmix_atomic_wmb(); + return pmix_atomic_cmpset_32(addr, oldval, newval); +} + +#if (PMIX_ASM_SUPPORT_64BIT == 1) + +#define PMIX_HAVE_ATOMIC_CMPSET_64 1 +static inline int pmix_atomic_cmpset_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int64_t ret; + int tmp; + + + __asm__ __volatile__ ( + "1: ldrexd %0, %H0, [%2] \n" + " cmp %0, %3 \n" + " it eq \n" + " cmpeq %H0, %H3 \n" + " bne 2f \n" + " strexd %1, %4, %H4, [%2] \n" + " cmp %1, #0 \n" + " bne 1b \n" + "2: \n" + + : "=&r" (ret), "=&r" (tmp) + : "r" (addr), "r" (oldval), "r" (newval) + : "cc", "memory"); + + return (ret == oldval); +} + +/* these two functions aren't inlined in the non-gcc case because then + there would be two function calls (since neither cmpset_64 nor + atomic_?mb can be inlined). 
Instead, we "inline" them by hand in + the assembly, meaning there is one function call overhead instead + of two */ +static inline int pmix_atomic_cmpset_acq_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int rc; + + rc = pmix_atomic_cmpset_64(addr, oldval, newval); + pmix_atomic_rmb(); + + return rc; +} + + +static inline int pmix_atomic_cmpset_rel_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + pmix_atomic_wmb(); + return pmix_atomic_cmpset_64(addr, oldval, newval); +} + +#endif + + +#define PMIX_HAVE_ATOMIC_ADD_32 1 +static inline int32_t pmix_atomic_add_32(volatile int32_t* v, int inc) +{ + int32_t t; + int tmp; + + __asm__ __volatile__( + "1: ldrex %0, [%2] \n" + " add %0, %0, %3 \n" + " strex %1, %0, [%2] \n" + " cmp %1, #0 \n" + " bne 1b \n" + + : "=&r" (t), "=&r" (tmp) + : "r" (v), "r" (inc) + : "cc", "memory"); + + + return t; +} + +#define PMIX_HAVE_ATOMIC_SUB_32 1 +static inline int32_t pmix_atomic_sub_32(volatile int32_t* v, int dec) +{ + int32_t t; + int tmp; + + __asm__ __volatile__( + "1: ldrex %0, [%2] \n" + " sub %0, %0, %3 \n" + " strex %1, %0, [%2] \n" + " cmp %1, #0 \n" + " bne 1b \n" + + : "=&r" (t), "=&r" (tmp) + : "r" (v), "r" (dec) + : "cc", "memory"); + + return t; +} + +#else /* PMIX_ASM_ARM_VERSION <=5 or no GCC inline assembly */ + +#define PMIX_HAVE_ATOMIC_CMPSET_32 1 +#define __kuser_cmpxchg (*((int (*)(int, int, volatile int*))(0xffff0fc0))) +static inline int pmix_atomic_cmpset_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + return !(__kuser_cmpxchg(oldval, newval, addr)); +} + +static inline int pmix_atomic_cmpset_acq_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + /* kernel function includes all necessary memory barriers */ + return pmix_atomic_cmpset_32(addr, oldval, newval); +} + +static inline int pmix_atomic_cmpset_rel_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + /* kernel function includes all necessary memory barriers */ + return 
pmix_atomic_cmpset_32(addr, oldval, newval); +} + +#endif + +#endif /* ! PMIX_SYS_ARCH_ATOMIC_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm/timer.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm/timer.h new file mode 100644 index 00000000000..65532ac8a77 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm/timer.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_TIMER_H +#define PMIX_SYS_ARCH_TIMER_H 1 + +#include <sys/times.h> /* struct tms, times() */ + +typedef uint64_t pmix_timer_t; + +static inline pmix_timer_t +pmix_sys_timer_get_cycles(void) +{ /* returns the sum of the user and system CPU times filled in by times(); this is a CPU-time tick count, not a hardware cycle counter */ + pmix_timer_t ret; + struct tms accurate_clock; + + times(&accurate_clock); + ret = accurate_clock.tms_utime + accurate_clock.tms_stime; + + return ret; +} + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 1 + +#endif /* ! PMIX_SYS_ARCH_TIMER_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm64/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm64/Makefile.include new file mode 100644 index 00000000000..980c5fed3bd --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm64/Makefile.include @@ -0,0 +1,24 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2008 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This makefile.am does not stand on its own - it is included from pmix/include/Makefile.am + +headers += \ + atomics/sys/arm64/atomic.h \ + atomics/sys/arm64/timer.h diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm64/atomic.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm64/atomic.h new file mode 100644 index 00000000000..c48c9143d36 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm64/atomic.h @@ -0,0 +1,302 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2010 ARM ltd. All rights reserved. + * Copyright (c) 2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(PMIX_SYS_ARCH_ATOMIC_H) + +#define PMIX_SYS_ARCH_ATOMIC_H 1 + +#if PMIX_GCC_INLINE_ASSEMBLY + +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 1 +#define PMIX_HAVE_ATOMIC_LLSC_32 1 +#define PMIX_HAVE_ATOMIC_CMPSET_32 1 +#define PMIX_HAVE_ATOMIC_SWAP_32 1 +#define PMIX_HAVE_ATOMIC_MATH_32 1 +#define PMIX_HAVE_ATOMIC_CMPSET_64 1 +#define PMIX_HAVE_ATOMIC_SWAP_64 1 +#define PMIX_HAVE_ATOMIC_LLSC_64 1 +#define PMIX_HAVE_ATOMIC_ADD_32 1 +#define PMIX_HAVE_ATOMIC_SUB_32 1 +#define PMIX_HAVE_ATOMIC_ADD_64 1 +#define PMIX_HAVE_ATOMIC_SUB_64 1 + +#define PMIXMB() __asm__ __volatile__ ("dmb sy" : : : "memory") +#define PMIXRMB() __asm__ __volatile__ ("dmb ld" : : : "memory") +#define PMIXWMB() __asm__ __volatile__ ("dmb st" : : : "memory") + +/********************************************************************** + * + * Memory Barriers + * + *********************************************************************/ + +static inline void pmix_atomic_mb (void) +{ + PMIXMB(); +} + +static inline void pmix_atomic_rmb (void) +{ + PMIXRMB(); +} + +static inline void pmix_atomic_wmb (void) +{ + PMIXWMB(); +} + +static inline void pmix_atomic_isync (void) +{ + __asm__ __volatile__ ("isb"); /* note: unlike the dmb macros above, no "memory" clobber is given here */ +} + +/********************************************************************** + * + * Atomic compare-and-set, swap and LL/SC operations + * + *********************************************************************/ + +static inline int pmix_atomic_cmpset_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ /* returns 1 when the value loaded from *addr equalled oldval (newval was then stored), 0 otherwise; the exclusive load/store pair is retried while stxr reports failure */ + int32_t ret, tmp; + + __asm__ __volatile__ ("1: ldaxr %w0, [%2] \n" + " cmp %w0, %w3 \n" + " bne 2f \n" + " stxr %w1, %w4, [%2] \n" + " cbnz %w1, 1b \n" + "2: \n" + : "=&r" (ret), "=&r" (tmp) + : "r" (addr), "r" (oldval), "r" (newval) + : "cc", "memory"); + + return (ret == oldval); +} + +static inline int32_t pmix_atomic_swap_32(volatile int32_t *addr, int32_t newval) +{ + int32_t ret, tmp; + + __asm__ __volatile__ ("1: ldaxr %w0, 
[%2] \n" + " stlxr %w1, %w3, [%2] \n" + " cbnz %w1, 1b \n" + : "=&r" (ret), "=&r" (tmp) + : "r" (addr), "r" (newval) + : "cc", "memory"); + + return ret; /* the value that was in *addr before newval was stored */ +} + +/* acquire / release flavours of the 32-bit compare-and-set. Rather than + calling pmix_atomic_cmpset_32 together with a separate barrier, the + two variants differ only in the exclusive access instructions used: + ldaxr + stxr in the _acq variant and ldxr + stlxr in the _rel + variant */ +static inline int pmix_atomic_cmpset_acq_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + int32_t ret, tmp; + + __asm__ __volatile__ ("1: ldaxr %w0, [%2] \n" + " cmp %w0, %w3 \n" + " bne 2f \n" + " stxr %w1, %w4, [%2] \n" + " cbnz %w1, 1b \n" + "2: \n" + : "=&r" (ret), "=&r" (tmp) + : "r" (addr), "r" (oldval), "r" (newval) + : "cc", "memory"); + + return (ret == oldval); +} + + +static inline int pmix_atomic_cmpset_rel_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + int32_t ret, tmp; + + __asm__ __volatile__ ("1: ldxr %w0, [%2] \n" + " cmp %w0, %w3 \n" + " bne 2f \n" + " stlxr %w1, %w4, [%2] \n" + " cbnz %w1, 1b \n" + "2: \n" + : "=&r" (ret), "=&r" (tmp) + : "r" (addr), "r" (oldval), "r" (newval) + : "cc", "memory"); + + return (ret == oldval); +} + +static inline int32_t pmix_atomic_ll_32 (volatile int32_t *addr) +{ /* load-linked half of an LL/SC pair: exclusive load (ldaxr) of *addr */ + int32_t ret; + + __asm__ __volatile__ ("ldaxr %w0, [%1] \n" + : "=&r" (ret) + : "r" (addr)); + + return ret; +} + +static inline int pmix_atomic_sc_32 (volatile int32_t *addr, int32_t newval) +{ /* store-conditional half: returns 1 when the exclusive store (stlxr) reported status 0, 0 otherwise */ + int ret; + + __asm__ __volatile__ ("stlxr %w0, %w2, [%1] \n" + : "=&r" (ret) + : "r" (addr), "r" (newval) + : "cc", "memory"); + + return ret == 0; +} + +static inline int pmix_atomic_cmpset_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ /* 64-bit counterpart of pmix_atomic_cmpset_32: returns 1 when *addr equalled oldval and newval was stored */ + int64_t ret; + int tmp; + + __asm__ __volatile__ ("1: ldaxr %0, [%2] \n" + " cmp %0, %3 \n" + " bne 2f \n" + " stxr %w1, %4, [%2] \n" + " cbnz %w1, 1b \n" + "2: \n" + : "=&r" (ret), "=&r" (tmp) + : "r" (addr), "r" (oldval), "r" 
(newval) + : "cc", "memory"); + + return (ret == oldval); +} + +static inline int64_t pmix_atomic_swap_64 (volatile int64_t *addr, int64_t newval) +{ + int64_t ret; + int tmp; + + __asm__ __volatile__ ("1: ldaxr %0, [%2] \n" + " stlxr %w1, %3, [%2] \n" + " cbnz %w1, 1b \n" + : "=&r" (ret), "=&r" (tmp) + : "r" (addr), "r" (newval) + : "cc", "memory"); + + return ret; /* the value that was in *addr before newval was stored */ +} + +/* acquire / release flavours of the 64-bit compare-and-set. Rather than + calling pmix_atomic_cmpset_64 together with a separate barrier, the + two variants differ only in the exclusive access instructions used: + ldaxr + stxr in the _acq variant and ldxr + stlxr in the _rel + variant */ +static inline int pmix_atomic_cmpset_acq_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int64_t ret; + int tmp; + + __asm__ __volatile__ ("1: ldaxr %0, [%2] \n" + " cmp %0, %3 \n" + " bne 2f \n" + " stxr %w1, %4, [%2] \n" + " cbnz %w1, 1b \n" + "2: \n" + : "=&r" (ret), "=&r" (tmp) + : "r" (addr), "r" (oldval), "r" (newval) + : "cc", "memory"); + + return (ret == oldval); +} + + +static inline int pmix_atomic_cmpset_rel_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int64_t ret; + int tmp; + + __asm__ __volatile__ ("1: ldxr %0, [%2] \n" + " cmp %0, %3 \n" + " bne 2f \n" + " stlxr %w1, %4, [%2] \n" + " cbnz %w1, 1b \n" + "2: \n" + : "=&r" (ret), "=&r" (tmp) + : "r" (addr), "r" (oldval), "r" (newval) + : "cc", "memory"); + + return (ret == oldval); +} + +static inline int64_t pmix_atomic_ll_64 (volatile int64_t *addr) +{ /* load-linked half of an LL/SC pair: exclusive load (ldaxr) of *addr */ + int64_t ret; + + __asm__ __volatile__ ("ldaxr %0, [%1] \n" + : "=&r" (ret) + : "r" (addr)); + + return ret; +} + +static inline int pmix_atomic_sc_64 (volatile int64_t *addr, int64_t newval) +{ /* store-conditional half: returns 1 when the exclusive store (stlxr) reported status 0, 0 otherwise */ + int ret; + + __asm__ __volatile__ ("stlxr %w0, %2, [%1] \n" + : "=&r" (ret) + : "r" (addr), "r" (newval) + : "cc", "memory"); + + return ret == 0; +} + +#define PMIX_ASM_MAKE_ATOMIC(type, bits, name, inst, reg) \ + static inline type pmix_atomic_ ## name ## _ ## bits 
(volatile type *addr, type value) \ + { \ + type newval; \ + int32_t tmp; \ + \ + __asm__ __volatile__("1: ldxr %" reg "0, [%2] \n" \ + " " inst " %" reg "0, %" reg "0, %" reg "3 \n" \ + " stxr %w1, %" reg "0, [%2] \n" \ + " cbnz %w1, 1b \n" \ + : "=&r" (newval), "=&r" (tmp) \ + : "r" (addr), "r" (value) \ + : "cc", "memory"); \ + \ + return newval; \ + } + +PMIX_ASM_MAKE_ATOMIC(int32_t, 32, add, "add", "w") /* pmix_atomic_add_32; like the three below, returns the updated value */ +PMIX_ASM_MAKE_ATOMIC(int32_t, 32, sub, "sub", "w") +PMIX_ASM_MAKE_ATOMIC(int64_t, 64, add, "add", "") +PMIX_ASM_MAKE_ATOMIC(int64_t, 64, sub, "sub", "") + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#endif /* ! PMIX_SYS_ARCH_ATOMIC_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm64/timer.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm64/timer.h new file mode 100644 index 00000000000..bacc4b919eb --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/arm64/timer.h @@ -0,0 +1,46 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2016 Broadcom Limited. All rights reserved. + * Copyright (c) 2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_TIMER_H +#define PMIX_SYS_ARCH_TIMER_H 1 + +#include <sys/times.h> + +typedef uint64_t pmix_timer_t; + +static inline pmix_timer_t +pmix_sys_timer_get_cycles(void) +{ /* returns the value read from the CNTVCT_EL0 system register, after an isb */ + pmix_timer_t ret; + + __asm__ __volatile__ ("isb" ::: "memory"); + __asm__ __volatile__ ("mrs %0, CNTVCT_EL0" : "=r" (ret)); + + return ret; +} + + +static inline pmix_timer_t +pmix_sys_timer_freq(void) +{ /* returns the value read from the CNTFRQ_EL0 system register */ + pmix_timer_t freq; + __asm__ __volatile__ ("mrs %0, CNTFRQ_EL0" : "=r" (freq)); + return (pmix_timer_t)(freq); +} + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 1 + +#endif /* ! 
PMIX_SYS_ARCH_TIMER_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/atomic.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/atomic.h new file mode 100644 index 00000000000..e18d2cb1a42 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/atomic.h @@ -0,0 +1,623 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file + * + * Atomic operations. + * + * This API is patterned after the FreeBSD kernel atomic interface + * (which is influenced by Intel's ia64 architecture). The + * FreeBSD interface is documented at + * + * http://www.freebsd.org/cgi/man.cgi?query=atomic&sektion=9 + * + * Only the necessary subset of functions is implemented here. 
+ * + * The following #defines will be true / false based on + * assembly support: + * + * - \c PMIX_HAVE_ATOMIC_MEM_BARRIER atomic memory barriers + * - \c PMIX_HAVE_ATOMIC_SPINLOCKS atomic spinlocks + * - \c PMIX_HAVE_ATOMIC_MATH_32 if 32 bit add/sub/cmpset can be done "atomically" + * - \c PMIX_HAVE_ATOMIC_MATH_64 if 64 bit add/sub/cmpset can be done "atomically" + * + * Note that for the Atomic math, atomic add/sub may be implemented as + * C code using pmix_atomic_cmpset. The appearance of atomic + * operation will be upheld in these cases. + */ + +#ifndef PMIX_SYS_ATOMIC_H +#define PMIX_SYS_ATOMIC_H 1 + +#include "pmix_config.h" + +#include "src/atomics/sys/architecture.h" +#include "src/include/pmix_stdint.h" + +/* do some quick #define cleanup in cases where we are doing + testing... */ +#ifdef PMIX_DISABLE_INLINE_ASM +#undef PMIX_C_GCC_INLINE_ASSEMBLY +#define PMIX_C_GCC_INLINE_ASSEMBLY 0 +#undef PMIX_C_DEC_INLINE_ASSEMBLY +#define PMIX_C_DEC_INLINE_ASSEMBLY 0 +#undef PMIX_C_XLC_INLINE_ASSEMBLY +#define PMIX_C_XLC_INLINE_ASSEMBLY 0 +#endif + +/* define PMIX_{GCC,DEC,XLC}_INLINE_ASSEMBLY based on the + PMIX_C_{GCC,DEC,XLC}_INLINE_ASSEMBLY defines and whether we + are in C or C++ */ +#if defined(c_plusplus) || defined(__cplusplus) +/* We no longer support inline assembly for C++ as PMIX is a C-only interface */ +#define PMIX_GCC_INLINE_ASSEMBLY 0 +#define PMIX_DEC_INLINE_ASSEMBLY 0 +#define PMIX_XLC_INLINE_ASSEMBLY 0 +#else +#define PMIX_GCC_INLINE_ASSEMBLY PMIX_C_GCC_INLINE_ASSEMBLY +#define PMIX_DEC_INLINE_ASSEMBLY PMIX_C_DEC_INLINE_ASSEMBLY +#define PMIX_XLC_INLINE_ASSEMBLY PMIX_C_XLC_INLINE_ASSEMBLY +#endif + + +BEGIN_C_DECLS +/********************************************************************** + * + * Data structures for atomic ops + * + *********************************************************************/ +/** + * Volatile lock object (with optional padding). 
+ * + * \note The internals of the lock are included here, but should be + * considered private. The implementation currently in use may choose + * to use an int or unsigned char as the lock value - the user is not + * informed either way. + */ +struct pmix_atomic_lock_t { + union { + volatile int32_t lock; /**< The lock value (a 32-bit integer) */ + volatile unsigned char sparc_lock; /**< The lock value on sparc (a single byte) */ + char padding[sizeof(int)]; /**< Array for optional padding; sized with sizeof(int) */ + } u; +}; +typedef struct pmix_atomic_lock_t pmix_atomic_lock_t; + +/********************************************************************** + * + * Set or unset these macros in the architecture-specific atomic.h + * files if we need to specify them as inline or non-inline. + * The defaults below are all 1 when PMIX_GCC_INLINE_ASSEMBLY is + * non-zero and all 0 otherwise. + * + *********************************************************************/ +#if !PMIX_GCC_INLINE_ASSEMBLY +#define PMIX_HAVE_INLINE_ATOMIC_MEM_BARRIER 0 +#define PMIX_HAVE_INLINE_ATOMIC_CMPSET_32 0 +#define PMIX_HAVE_INLINE_ATOMIC_CMPSET_64 0 +#define PMIX_HAVE_INLINE_ATOMIC_ADD_32 0 +#define PMIX_HAVE_INLINE_ATOMIC_SUB_32 0 +#define PMIX_HAVE_INLINE_ATOMIC_ADD_64 0 +#define PMIX_HAVE_INLINE_ATOMIC_SUB_64 0 +#define PMIX_HAVE_INLINE_ATOMIC_SWAP_32 0 +#define PMIX_HAVE_INLINE_ATOMIC_SWAP_64 0 +#else +#define PMIX_HAVE_INLINE_ATOMIC_MEM_BARRIER 1 +#define PMIX_HAVE_INLINE_ATOMIC_CMPSET_32 1 +#define PMIX_HAVE_INLINE_ATOMIC_CMPSET_64 1 +#define PMIX_HAVE_INLINE_ATOMIC_ADD_32 1 +#define PMIX_HAVE_INLINE_ATOMIC_SUB_32 1 +#define PMIX_HAVE_INLINE_ATOMIC_ADD_64 1 +#define PMIX_HAVE_INLINE_ATOMIC_SUB_64 1 +#define PMIX_HAVE_INLINE_ATOMIC_SWAP_32 1 +#define PMIX_HAVE_INLINE_ATOMIC_SWAP_64 1 +#endif /* !PMIX_GCC_INLINE_ASSEMBLY */ + +/** + * Enumeration of lock states + */ +enum { + PMIX_ATOMIC_UNLOCKED = 0, + PMIX_ATOMIC_LOCKED = 1 +}; + +/********************************************************************** + * + * Load the appropriate architecture files and set some reasonable + * default values for our support + * + 
*********************************************************************/ +#if defined(DOXYGEN) +/* don't include system-level gorp when generating doxygen files */ +#elif PMIX_ASSEMBLY_BUILTIN == PMIX_BUILTIN_SYNC +#include "src/atomics/sys/sync_builtin/atomic.h" +#elif PMIX_ASSEMBLY_BUILTIN == PMIX_BUILTIN_GCC +#include "src/atomics/sys/gcc_builtin/atomic.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_X86_64 +#include "src/atomics/sys/x86_64/atomic.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_ARM +#include "src/atomics/sys/arm/atomic.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_ARM64 +#include "src/atomics/sys/arm64/atomic.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_IA32 +#include "src/atomics/sys/ia32/atomic.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_IA64 +#include "src/atomics/sys/ia64/atomic.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_MIPS +#include "src/atomics/sys/mips/atomic.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_POWERPC32 +#include "src/atomics/sys/powerpc/atomic.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_POWERPC64 +#include "src/atomics/sys/powerpc/atomic.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_SPARC +#include "src/atomics/sys/sparc/atomic.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_SPARCV9_32 +#include "src/atomics/sys/sparcv9/atomic.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_SPARCV9_64 +#include "src/atomics/sys/sparcv9/atomic.h" +#endif + +#ifndef DOXYGEN +/* compare and set operations can't really be emulated from software, + so if these defines aren't already set, they should be set to 0 + now */ +#ifndef PMIX_HAVE_ATOMIC_CMPSET_32 +#define PMIX_HAVE_ATOMIC_CMPSET_32 0 +#endif +#ifndef PMIX_HAVE_ATOMIC_CMPSET_64 +#define PMIX_HAVE_ATOMIC_CMPSET_64 0 +#endif +#ifndef PMIX_HAVE_ATOMIC_CMPSET_128 +#define PMIX_HAVE_ATOMIC_CMPSET_128 0 +#endif +#ifndef PMIX_HAVE_ATOMIC_LLSC_32 +#define PMIX_HAVE_ATOMIC_LLSC_32 0 +#endif +#ifndef PMIX_HAVE_ATOMIC_LLSC_64 +#define PMIX_HAVE_ATOMIC_LLSC_64 0 +#endif +#endif /* !DOXYGEN */ + +/********************************************************************** + * + * Memory Barriers - defined 
here if running doxygen or have barriers + * but can't inline + * + *********************************************************************/ +#if !defined(PMIX_HAVE_ATOMIC_MEM_BARRIER) && !defined(DOXYGEN) +/* no way to emulate in C code */ +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 0 +#endif + +#if defined(DOXYGEN) || PMIX_HAVE_ATOMIC_MEM_BARRIER +/** + * Memory barrier + * + * Will use system-specific features to instruct the processor and + * memory controller that all writes and reads that have been posted + * before the call to \c pmix_atomic_mb() must appear to have + * completed before the next read or write. + * + * \note This can have some expensive side effects, including flushing + * the pipeline, preventing the cpu from reordering instructions, and + * generally grinding the memory controller's performance. Use only + * if you need *both* read and write barriers. + */ + +#if PMIX_HAVE_INLINE_ATOMIC_MEM_BARRIER +static inline +#endif +void pmix_atomic_mb(void); + +/** + * Read memory barrier + * + * Use system-specific features to instruct the processor and memory + * controller that all reads that have been posted before the call to + * \c pmix_atomic_rmb() must appear to have been completed before the + * next read. Nothing is said about the ordering of writes when using + * \c pmix_atomic_rmb(). + */ + +#if PMIX_HAVE_INLINE_ATOMIC_MEM_BARRIER +static inline +#endif +void pmix_atomic_rmb(void); + +/** + * Write memory barrier. + * + * Use system-specific features to instruct the processor and memory + * controller that all writes that have been posted before the call to + * \c pmix_atomic_wmb() must appear to have been completed before the + * next write. Nothing is said about the ordering of reads when using + * \c pmix_atomic_wmb(). 
+ */ + +#if PMIX_HAVE_INLINE_ATOMIC_MEM_BARRIER +static inline +#endif +void pmix_atomic_wmb(void); + +#endif /* defined(DOXYGEN) || PMIX_HAVE_ATOMIC_MEM_BARRIER */ + + +/********************************************************************** + * + * Atomic spinlocks - always inlined, if have atomic cmpset + * + *********************************************************************/ + +#if !defined(PMIX_HAVE_ATOMIC_SPINLOCKS) && !defined(DOXYGEN) +/* 0 is more like "pending" - we'll fix up at the end after all + the static inline functions are declared */ +#define PMIX_HAVE_ATOMIC_SPINLOCKS 0 +#endif + +#if defined(DOXYGEN) || PMIX_HAVE_ATOMIC_SPINLOCKS || (PMIX_HAVE_ATOMIC_CMPSET_32 || PMIX_HAVE_ATOMIC_CMPSET_64) + +/** + * Initialize a lock to value + * + * @param lock Address of the lock + * @param value Initial value to set lock to + */ +#if PMIX_HAVE_ATOMIC_SPINLOCKS == 0 +static inline +#endif +void pmix_atomic_init(pmix_atomic_lock_t* lock, int32_t value); + + +/** + * Try to acquire a lock. + * + * @param lock Address of the lock. + * @return 0 if the lock was acquired, 1 otherwise. + */ +#if PMIX_HAVE_ATOMIC_SPINLOCKS == 0 +static inline +#endif +int pmix_atomic_trylock(pmix_atomic_lock_t *lock); + + +/** + * Acquire a lock by spinning. + * + * @param lock Address of the lock. + */ +#if PMIX_HAVE_ATOMIC_SPINLOCKS == 0 +static inline +#endif +void pmix_atomic_lock(pmix_atomic_lock_t *lock); + + +/** + * Release a lock. + * + * @param lock Address of the lock. 
+ */ +#if PMIX_HAVE_ATOMIC_SPINLOCKS == 0 +static inline +#endif +void pmix_atomic_unlock(pmix_atomic_lock_t *lock); + + +#if PMIX_HAVE_ATOMIC_SPINLOCKS == 0 +#undef PMIX_HAVE_ATOMIC_SPINLOCKS +#define PMIX_HAVE_ATOMIC_SPINLOCKS (PMIX_HAVE_ATOMIC_CMPSET_32 || PMIX_HAVE_ATOMIC_CMPSET_64) +#define PMIX_NEED_INLINE_ATOMIC_SPINLOCKS 1 +#endif + +#endif /* PMIX_HAVE_ATOMIC_SPINLOCKS */ + + +/********************************************************************** + * + * Atomic math operations + * + *********************************************************************/ +#if !defined(PMIX_HAVE_ATOMIC_CMPSET_32) && !defined(DOXYGEN) +#define PMIX_HAVE_ATOMIC_CMPSET_32 0 +#endif +#if defined(DOXYGEN) || PMIX_HAVE_ATOMIC_CMPSET_32 + +#if PMIX_HAVE_INLINE_ATOMIC_CMPSET_32 +static inline +#endif +int pmix_atomic_cmpset_32(volatile int32_t *addr, int32_t oldval, + int32_t newval); + +#if PMIX_HAVE_INLINE_ATOMIC_CMPSET_32 +static inline +#endif +int pmix_atomic_cmpset_acq_32(volatile int32_t *addr, int32_t oldval, + int32_t newval); + +#if PMIX_HAVE_INLINE_ATOMIC_CMPSET_32 +static inline +#endif +int pmix_atomic_cmpset_rel_32(volatile int32_t *addr, int32_t oldval, + int32_t newval); +#endif /* defined(DOXYGEN) || PMIX_HAVE_ATOMIC_CMPSET_32 */ + + +#if !defined(PMIX_HAVE_ATOMIC_CMPSET_64) && !defined(DOXYGEN) +#define PMIX_HAVE_ATOMIC_CMPSET_64 0 +#endif +#if defined(DOXYGEN) || PMIX_HAVE_ATOMIC_CMPSET_64 + +#if PMIX_HAVE_INLINE_ATOMIC_CMPSET_64 +static inline +#endif +int pmix_atomic_cmpset_64(volatile int64_t *addr, int64_t oldval, + int64_t newval); + +#if PMIX_HAVE_INLINE_ATOMIC_CMPSET_64 +static inline +#endif +int pmix_atomic_cmpset_acq_64(volatile int64_t *addr, int64_t oldval, + int64_t newval); + +#if PMIX_HAVE_INLINE_ATOMIC_CMPSET_64 +static inline +#endif +int pmix_atomic_cmpset_rel_64(volatile int64_t *addr, int64_t oldval, + int64_t newval); + +#endif /* defined(DOXYGEN) || PMIX_HAVE_ATOMIC_CMPSET_64 */ + +#if !defined(PMIX_HAVE_ATOMIC_MATH_32) && !defined(DOXYGEN) + /* define to 0 for these tests. Will fix up later. 
*/ + #define PMIX_HAVE_ATOMIC_MATH_32 0 +#endif + +#if defined(DOXYGEN) || PMIX_HAVE_ATOMIC_MATH_32 || PMIX_HAVE_ATOMIC_CMPSET_32 + +/* PMIX_HAVE_INLINE_ATOMIC_*_32 will be 1 if <arch>/atomic.h provides + a static inline version of it (in assembly). If we have to fall + back on cmpset 32, that too will be inline. */ +#if PMIX_HAVE_INLINE_ATOMIC_ADD_32 || (!defined(PMIX_HAVE_ATOMIC_ADD_32) && PMIX_HAVE_ATOMIC_CMPSET_32) +static inline +#endif +int32_t pmix_atomic_add_32(volatile int32_t *addr, int delta); + +/* PMIX_HAVE_INLINE_ATOMIC_*_32 will be 1 if <arch>/atomic.h provides + a static inline version of it (in assembly). If we have to fall + back to cmpset 32 (i.e. PMIX_HAVE_ATOMIC_SUB_32 is not defined), that too will be inline. */ +#if PMIX_HAVE_INLINE_ATOMIC_SUB_32 || (!defined(PMIX_HAVE_ATOMIC_SUB_32) && PMIX_HAVE_ATOMIC_CMPSET_32) +static inline +#endif +int32_t pmix_atomic_sub_32(volatile int32_t *addr, int delta); + +#endif /* PMIX_HAVE_ATOMIC_MATH_32 */ + +#if ! PMIX_HAVE_ATOMIC_MATH_32 +/* fix up the value of pmix_have_atomic_math_32 to allow for C versions */ +#undef PMIX_HAVE_ATOMIC_MATH_32 +#define PMIX_HAVE_ATOMIC_MATH_32 PMIX_HAVE_ATOMIC_CMPSET_32 +#endif + +#ifndef PMIX_HAVE_ATOMIC_MATH_64 +/* define to 0 for these tests. Will fix up later. */ +#define PMIX_HAVE_ATOMIC_MATH_64 0 +#endif + +#if defined(DOXYGEN) || PMIX_HAVE_ATOMIC_MATH_64 || PMIX_HAVE_ATOMIC_CMPSET_64 + +/* PMIX_HAVE_INLINE_ATOMIC_*_64 will be 1 if <arch>/atomic.h provides + a static inline version of it (in assembly). If we have to fall + back to cmpset 64, that too will be inline */ +#if PMIX_HAVE_INLINE_ATOMIC_ADD_64 || (!defined(PMIX_HAVE_ATOMIC_ADD_64) && PMIX_HAVE_ATOMIC_CMPSET_64) +static inline +#endif +int64_t pmix_atomic_add_64(volatile int64_t *addr, int64_t delta); + +/* PMIX_HAVE_INLINE_ATOMIC_*_64 will be 1 if <arch>/atomic.h provides + a static inline version of it (in assembly). 
If we have to fall + back to cmpset 64 (i.e. PMIX_HAVE_ATOMIC_SUB_64 is not defined), that too will be inline */ +#if PMIX_HAVE_INLINE_ATOMIC_SUB_64 || (!defined(PMIX_HAVE_ATOMIC_SUB_64) && PMIX_HAVE_ATOMIC_CMPSET_64) +static inline +#endif +int64_t pmix_atomic_sub_64(volatile int64_t *addr, int64_t delta); + +#endif /* PMIX_HAVE_ATOMIC_MATH_64 */ + +#if ! PMIX_HAVE_ATOMIC_MATH_64 +/* fix up the value of pmix_have_atomic_math_64 to allow for C versions */ +#undef PMIX_HAVE_ATOMIC_MATH_64 +#define PMIX_HAVE_ATOMIC_MATH_64 PMIX_HAVE_ATOMIC_CMPSET_64 +#endif + +/* provide a size_t add/subtract. When in debug mode, make it an + * inline function so that we don't have any casts in the + * interface and can catch type errors. When not in debug mode, + * just make it a macro, so that there's no performance penalty. + * The pointer casts keep the volatile qualifier, and the macro + * arguments are parenthesized so that expression arguments expand + * as intended. + */ +#if defined(DOXYGEN) || PMIX_ENABLE_DEBUG +static inline size_t +pmix_atomic_add_size_t(volatile size_t *addr, int delta) +{ +#if SIZEOF_SIZE_T == 4 + return (size_t) pmix_atomic_add_32((volatile int32_t*) addr, delta); +#elif SIZEOF_SIZE_T == 8 + return (size_t) pmix_atomic_add_64((volatile int64_t*) addr, delta); +#else +#error "Unknown size_t size" +#endif +} +static inline size_t +pmix_atomic_sub_size_t(volatile size_t *addr, int delta) +{ +#if SIZEOF_SIZE_T == 4 + return (size_t) pmix_atomic_sub_32((volatile int32_t*) addr, delta); +#elif SIZEOF_SIZE_T == 8 + return (size_t) pmix_atomic_sub_64((volatile int64_t*) addr, delta); +#else +#error "Unknown size_t size" +#endif +} +#else +#if SIZEOF_SIZE_T == 4 +#define pmix_atomic_add_size_t(addr, delta) ((size_t) pmix_atomic_add_32((volatile int32_t*) (addr), (delta))) +#define pmix_atomic_sub_size_t(addr, delta) ((size_t) pmix_atomic_sub_32((volatile int32_t*) (addr), (delta))) +#elif SIZEOF_SIZE_T == 8 +#define pmix_atomic_add_size_t(addr, delta) ((size_t) pmix_atomic_add_64((volatile int64_t*) (addr), (delta))) +#define pmix_atomic_sub_size_t(addr, delta) ((size_t) pmix_atomic_sub_64((volatile int64_t*) (addr), (delta))) +#else +#error "Unknown size_t size" +#endif +#endif + +#if defined(DOXYGEN) || (PMIX_HAVE_ATOMIC_CMPSET_32 || 
PMIX_HAVE_ATOMIC_CMPSET_64) +/* these are always done with inline functions, so always mark as + static inline */ +static inline int pmix_atomic_cmpset_xx(volatile void* addr, int64_t oldval, + int64_t newval, size_t length); +static inline int pmix_atomic_cmpset_acq_xx(volatile void* addr, + int64_t oldval, int64_t newval, + size_t length); +static inline int pmix_atomic_cmpset_rel_xx(volatile void* addr, + int64_t oldval, int64_t newval, + size_t length); + +static inline int pmix_atomic_cmpset_ptr(volatile void* addr, + void* oldval, + void* newval); +static inline int pmix_atomic_cmpset_acq_ptr(volatile void* addr, + void* oldval, + void* newval); +static inline int pmix_atomic_cmpset_rel_ptr(volatile void* addr, + void* oldval, + void* newval); + +/** + * Atomic compare and set of pointer with relaxed semantics. This + * macro detects at compile time the size of the object the first + * argument points to and chooses the correct function to be called. + * + * \note This macro should only be used for integer types. + * + * @param addr Address of <TYPE>. + * @param oldval Comparison value <TYPE>. + * @param newval New value to set if comparison is true <TYPE>. + * + * See pmix_atomic_cmpset_* for pseudo-code. + */ +#define pmix_atomic_cmpset( ADDR, OLDVAL, NEWVAL ) \ + pmix_atomic_cmpset_xx( (volatile void*)(ADDR), (intptr_t)(OLDVAL), \ + (intptr_t)(NEWVAL), sizeof(*(ADDR)) ) + +/** + * Atomic compare and set of pointer with acquire semantics. This + * macro detects at compile time the size of the object the first + * argument points to and chooses the correct function to be called. + * + * \note This macro should only be used for integer types. + * + * @param addr Address of <TYPE>. + * @param oldval Comparison value <TYPE>. + * @param newval New value to set if comparison is true <TYPE>. + * + * See pmix_atomic_cmpset_acq_* for pseudo-code. 
+ */ +#define pmix_atomic_cmpset_acq( ADDR, OLDVAL, NEWVAL ) \ + pmix_atomic_cmpset_acq_xx( (volatile void*)(ADDR), (int64_t)(OLDVAL), \ + (int64_t)(NEWVAL), sizeof(*(ADDR)) ) + + +/** + * Atomic compare and set of pointer with release semantics. This + * macro detects at compile time the size of the object the first + * argument points to and chooses the correct function to be called. + * + * \note This macro should only be used for integer types. + * + * @param addr Address of <TYPE>. + * @param oldval Comparison value <TYPE>. + * @param newval New value to set if comparison is true <TYPE>. + * + * See pmix_atomic_cmpset_rel_* for pseudo-code. + */ +#define pmix_atomic_cmpset_rel( ADDR, OLDVAL, NEWVAL ) \ + pmix_atomic_cmpset_rel_xx( (volatile void*)(ADDR), (int64_t)(OLDVAL), \ + (int64_t)(NEWVAL), sizeof(*(ADDR)) ) + +#endif /* (PMIX_HAVE_ATOMIC_CMPSET_32 || PMIX_HAVE_ATOMIC_CMPSET_64) */ + +#if defined(DOXYGEN) || (PMIX_HAVE_ATOMIC_MATH_32 || PMIX_HAVE_ATOMIC_MATH_64) + +static inline void pmix_atomic_add_xx(volatile void* addr, + int32_t value, size_t length); +static inline void pmix_atomic_sub_xx(volatile void* addr, + int32_t value, size_t length); +#if SIZEOF_VOID_P == 4 && PMIX_HAVE_ATOMIC_CMPSET_32 +static inline int32_t pmix_atomic_add_ptr( volatile void* addr, void* delta ); +static inline int32_t pmix_atomic_sub_ptr( volatile void* addr, void* delta ); +#elif SIZEOF_VOID_P == 8 && PMIX_HAVE_ATOMIC_CMPSET_64 +static inline int64_t pmix_atomic_add_ptr( volatile void* addr, void* delta ); +static inline int64_t pmix_atomic_sub_ptr( volatile void* addr, void* delta ); +#else +#error Atomic arithmetic on pointers not supported +#endif + +/** + * Atomically increment the content depending on the type. This + * macro detects at compile time the size of the object the first + * argument points to and chooses the correct function to be called. + * + * \note This macro should only be used for integer types. + * + * @param addr Address of <TYPE> + * @param delta Value to add (converted to int32_t by the macro). 
+ */ +#define pmix_atomic_add( ADDR, VALUE ) \ + pmix_atomic_add_xx( (volatile void*)(ADDR), (int32_t)(VALUE), \ + sizeof(*(ADDR)) ) + +/** + * Atomically decrement the content depending on the type. This + * macro detects at compile time the size of the object the first + * argument points to and chooses the correct function to be called. + * + * \note This macro should only be used for integer types. + * + * @param addr Address of <TYPE> + * @param delta Value to subtract (converted to int32_t by the macro). + */ +#define pmix_atomic_sub( ADDR, VALUE ) \ + pmix_atomic_sub_xx( (volatile void*)(ADDR), (int32_t)(VALUE), \ + sizeof(*(ADDR)) ) + +#endif /* PMIX_HAVE_ATOMIC_MATH_32 || PMIX_HAVE_ATOMIC_MATH_64 */ + + +/* + * Include inline implementations of everything not defined directly + * in assembly + */ +#include "src/atomics/sys/atomic_impl.h" + +END_C_DECLS + +#endif /* PMIX_SYS_ATOMIC_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/atomic_impl.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/atomic_impl.h new file mode 100644 index 00000000000..62213e3a508 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/atomic_impl.h @@ -0,0 +1,439 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* Inline C implementation of the functions defined in atomic.h */ + +#include <stdlib.h> + +/********************************************************************** + * + * Atomic math operations + * + * All the architectures provide a compare_and_set atomic operation. If + * they don't provide atomic additions and/or subtractions then we can + * define these operations using the atomic compare_and_set. + * + * Some architectures do not provide support for the 64 bits + * atomic operations. Until we find a better solution let's just + * undefine all those functions if there is no 64 bit cmpset + * + *********************************************************************/ +#if PMIX_HAVE_ATOMIC_CMPSET_32 + +#if !defined(PMIX_HAVE_ATOMIC_SWAP_32) +#define PMIX_HAVE_ATOMIC_SWAP_32 1 +static inline int32_t pmix_atomic_swap_32(volatile int32_t *addr, + int32_t newval) +{ + int32_t old; + do { + old = *addr; + } while (0 == pmix_atomic_cmpset_32(addr, old, newval)); + + return old; +} +#endif /* PMIX_HAVE_ATOMIC_SWAP_32 */ + +#if !defined(PMIX_HAVE_ATOMIC_ADD_32) +#define PMIX_HAVE_ATOMIC_ADD_32 1 +static inline int32_t +pmix_atomic_add_32(volatile int32_t *addr, int delta) +{ + int32_t oldval; + + do { + oldval = *addr; + } while (0 == pmix_atomic_cmpset_32(addr, oldval, oldval + delta)); + return (oldval + delta); +} +#endif /* PMIX_HAVE_ATOMIC_ADD_32 */ + + +#if !defined(PMIX_HAVE_ATOMIC_SUB_32) +#define PMIX_HAVE_ATOMIC_SUB_32 1 +static inline int32_t +pmix_atomic_sub_32(volatile int32_t *addr, int delta) +{ + int32_t oldval; + + do { + oldval = *addr; + } while (0 == pmix_atomic_cmpset_32(addr, oldval, oldval - delta)); + return (oldval - delta); +} +#endif /* PMIX_HAVE_ATOMIC_SUB_32 */ + +#endif /* PMIX_HAVE_ATOMIC_CMPSET_32 */ + + +#if PMIX_HAVE_ATOMIC_CMPSET_64 + +#if !defined(PMIX_HAVE_ATOMIC_SWAP_64) +#define PMIX_HAVE_ATOMIC_SWAP_64 1 +static inline int64_t pmix_atomic_swap_64(volatile
int64_t *addr, + int64_t newval) +{ + int64_t old; + do { + old = *addr; + } while (0 == pmix_atomic_cmpset_64(addr, old, newval)); + return old; +} +#endif /* PMIX_HAVE_ATOMIC_SWAP_64 */ + +#if !defined(PMIX_HAVE_ATOMIC_ADD_64) +#define PMIX_HAVE_ATOMIC_ADD_64 1 +static inline int64_t +pmix_atomic_add_64(volatile int64_t *addr, int64_t delta) +{ + int64_t oldval; + + do { + oldval = *addr; + } while (0 == pmix_atomic_cmpset_64(addr, oldval, oldval + delta)); + return (oldval + delta); +} +#endif /* PMIX_HAVE_ATOMIC_ADD_64 */ + + +#if !defined(PMIX_HAVE_ATOMIC_SUB_64) +#define PMIX_HAVE_ATOMIC_SUB_64 1 +static inline int64_t +pmix_atomic_sub_64(volatile int64_t *addr, int64_t delta) +{ + int64_t oldval; + + do { + oldval = *addr; + } while (0 == pmix_atomic_cmpset_64(addr, oldval, oldval - delta)); + return (oldval - delta); +} +#endif /* PMIX_HAVE_ATOMIC_SUB_64 */ + +#else + +#if !defined(PMIX_HAVE_ATOMIC_ADD_64) +#define PMIX_HAVE_ATOMIC_ADD_64 0 +#endif + +#if !defined(PMIX_HAVE_ATOMIC_SUB_64) +#define PMIX_HAVE_ATOMIC_SUB_64 0 +#endif + +#endif /* PMIX_HAVE_ATOMIC_CMPSET_64 */ + + +#if (PMIX_HAVE_ATOMIC_CMPSET_32 || PMIX_HAVE_ATOMIC_CMPSET_64) + +static inline int +pmix_atomic_cmpset_xx(volatile void* addr, int64_t oldval, + int64_t newval, size_t length) +{ + switch( length ) { +#if PMIX_HAVE_ATOMIC_CMPSET_32 + case 4: + return pmix_atomic_cmpset_32( (volatile int32_t*)addr, + (int32_t)oldval, (int32_t)newval ); +#endif /* PMIX_HAVE_ATOMIC_CMPSET_32 */ + +#if PMIX_HAVE_ATOMIC_CMPSET_64 + case 8: + return pmix_atomic_cmpset_64( (volatile int64_t*)addr, + (int64_t)oldval, (int64_t)newval ); +#endif /* PMIX_HAVE_ATOMIC_CMPSET_64 */ + } + abort(); + /* This should never happen, so deliberately abort (hopefully + leaving a corefile for analysis) */ +} + + +static inline int +pmix_atomic_cmpset_acq_xx(volatile void* addr, int64_t oldval, + int64_t newval, size_t length) +{ + switch( length ) { +#if PMIX_HAVE_ATOMIC_CMPSET_32 + case 4: + return
pmix_atomic_cmpset_acq_32( (volatile int32_t*)addr, + (int32_t)oldval, (int32_t)newval ); +#endif /* PMIX_HAVE_ATOMIC_CMPSET_32 */ + +#if PMIX_HAVE_ATOMIC_CMPSET_64 + case 8: + return pmix_atomic_cmpset_acq_64( (volatile int64_t*)addr, + (int64_t)oldval, (int64_t)newval ); +#endif /* PMIX_HAVE_ATOMIC_CMPSET_64 */ + } + /* This should never happen, so deliberately abort (hopefully + leaving a corefile for analysis) */ + abort(); +} + + +static inline int +pmix_atomic_cmpset_rel_xx(volatile void* addr, int64_t oldval, + int64_t newval, size_t length) +{ + switch( length ) { +#if PMIX_HAVE_ATOMIC_CMPSET_32 + case 4: + return pmix_atomic_cmpset_rel_32( (volatile int32_t*)addr, + (int32_t)oldval, (int32_t)newval ); +#endif /* PMIX_HAVE_ATOMIC_CMPSET_32 */ + +#if PMIX_HAVE_ATOMIC_CMPSET_64 + case 8: + return pmix_atomic_cmpset_rel_64( (volatile int64_t*)addr, + (int64_t)oldval, (int64_t)newval ); +#endif /* PMIX_HAVE_ATOMIC_CMPSET_64 */ + } + /* This should never happen, so deliberately abort (hopefully + leaving a corefile for analysis) */ + abort(); +} + + +static inline int +pmix_atomic_cmpset_ptr(volatile void* addr, + void* oldval, + void* newval) +{ +#if SIZEOF_VOID_P == 4 && PMIX_HAVE_ATOMIC_CMPSET_32 + return pmix_atomic_cmpset_32((int32_t*) addr, (unsigned long) oldval, + (unsigned long) newval); +#elif SIZEOF_VOID_P == 8 && PMIX_HAVE_ATOMIC_CMPSET_64 + return pmix_atomic_cmpset_64((int64_t*) addr, (unsigned long) oldval, + (unsigned long) newval); +#else + abort(); +#endif +} + + +static inline int +pmix_atomic_cmpset_acq_ptr(volatile void* addr, + void* oldval, + void* newval) +{ +#if SIZEOF_VOID_P == 4 && PMIX_HAVE_ATOMIC_CMPSET_32 + return pmix_atomic_cmpset_acq_32((int32_t*) addr, (unsigned long) oldval, + (unsigned long) newval); +#elif SIZEOF_VOID_P == 8 && PMIX_HAVE_ATOMIC_CMPSET_64 + return pmix_atomic_cmpset_acq_64((int64_t*) addr, (unsigned long) oldval, + (unsigned long) newval); +#else + abort(); +#endif +} + + +static inline int 
pmix_atomic_cmpset_rel_ptr(volatile void* addr, + void* oldval, + void* newval) +{ +#if SIZEOF_VOID_P == 4 && PMIX_HAVE_ATOMIC_CMPSET_32 + return pmix_atomic_cmpset_rel_32((int32_t*) addr, (unsigned long) oldval, + (unsigned long) newval); +#elif SIZEOF_VOID_P == 8 && PMIX_HAVE_ATOMIC_CMPSET_64 + return pmix_atomic_cmpset_rel_64((int64_t*) addr, (unsigned long) oldval, + (unsigned long) newval); +#else + abort(); +#endif +} + +#endif /* (PMIX_HAVE_ATOMIC_CMPSET_32 || PMIX_HAVE_ATOMIC_CMPSET_64) */ + +#if (PMIX_HAVE_ATOMIC_SWAP_32 || PMIX_HAVE_ATOMIC_SWAP_64) + +#if SIZEOF_VOID_P == 4 && PMIX_HAVE_ATOMIC_SWAP_32 +#define pmix_atomic_swap_ptr(addr, value) (void *) pmix_atomic_swap_32((int32_t *) addr, (int32_t) value) +#elif SIZEOF_VOID_P == 8 && PMIX_HAVE_ATOMIC_SWAP_64 +#define pmix_atomic_swap_ptr(addr, value) (void *) pmix_atomic_swap_64((int64_t *) addr, (int64_t) value) +#endif + +#endif /* (PMIX_HAVE_ATOMIC_SWAP_32 || PMIX_HAVE_ATOMIC_SWAP_64) */ + +#if (PMIX_HAVE_ATOMIC_LLSC_32 || PMIX_HAVE_ATOMIC_LLSC_64) + +#if SIZEOF_VOID_P == 4 && PMIX_HAVE_ATOMIC_LLSC_32 + +#define pmix_atomic_ll_ptr(addr) (void *) pmix_atomic_ll_32((int32_t *) addr) +#define pmix_atomic_sc_ptr(addr, newval) pmix_atomic_sc_32((int32_t *) addr, (int32_t) newval) + +#define PMIX_HAVE_ATOMIC_LLSC_PTR 1 + +#elif SIZEOF_VOID_P == 8 && PMIX_HAVE_ATOMIC_LLSC_64 + +#define pmix_atomic_ll_ptr(addr) (void *) pmix_atomic_ll_64((int64_t *) addr) +#define pmix_atomic_sc_ptr(addr, newval) pmix_atomic_sc_64((int64_t *) addr, (int64_t) newval) + +#define PMIX_HAVE_ATOMIC_LLSC_PTR 1 + +#endif + +#endif /* (PMIX_HAVE_ATOMIC_LLSC_32 || PMIX_HAVE_ATOMIC_LLSC_64)*/ + +#if !defined(PMIX_HAVE_ATOMIC_LLSC_PTR) +#define PMIX_HAVE_ATOMIC_LLSC_PTR 0 +#endif + +#if PMIX_HAVE_ATOMIC_MATH_32 || PMIX_HAVE_ATOMIC_MATH_64 + + +static inline void +pmix_atomic_add_xx(volatile void* addr, int32_t value, size_t length) +{ + switch( length ) { +#if PMIX_HAVE_ATOMIC_ADD_32 + case 4: + pmix_atomic_add_32( (volatile 
int32_t*)addr, (int32_t)value ); + break; +#endif /* PMIX_HAVE_ATOMIC_ADD_32 */ + +#if PMIX_HAVE_ATOMIC_ADD_64 + case 8: + pmix_atomic_add_64( (volatile int64_t*)addr, (int64_t)value ); + break; +#endif /* PMIX_HAVE_ATOMIC_ADD_64 */ + default: + /* This should never happen, so deliberately abort (hopefully + leaving a corefile for analysis) */ + abort(); + } +} + + +static inline void +pmix_atomic_sub_xx(volatile void* addr, int32_t value, size_t length) +{ + switch( length ) { +#if PMIX_HAVE_ATOMIC_SUB_32 + case 4: + pmix_atomic_sub_32( (volatile int32_t*)addr, (int32_t)value ); + break; +#endif /* PMIX_HAVE_ATOMIC_SUB_32 */ + +#if PMIX_HAVE_ATOMIC_SUB_64 + case 8: + pmix_atomic_sub_64( (volatile int64_t*)addr, (int64_t)value ); + break; +#endif /* PMIX_HAVE_ATOMIC_SUB_64 */ + default: + /* This should never happen, so deliberately abort (hopefully + leaving a corefile for analysis) */ + abort(); + } +} + +#if SIZEOF_VOID_P == 4 && PMIX_HAVE_ATOMIC_ADD_32 +static inline int32_t pmix_atomic_add_ptr( volatile void* addr, + void* delta ) +{ + return pmix_atomic_add_32((int32_t*) addr, (unsigned long) delta); +} +#elif SIZEOF_VOID_P == 8 && PMIX_HAVE_ATOMIC_ADD_64 +static inline int64_t pmix_atomic_add_ptr( volatile void* addr, + void* delta ) +{ + return pmix_atomic_add_64((int64_t*) addr, (unsigned long) delta); +} +#else +static inline int32_t pmix_atomic_add_ptr( volatile void* addr, + void* delta ) +{ + abort(); + return 0; +} +#endif + +#if SIZEOF_VOID_P == 4 && PMIX_HAVE_ATOMIC_SUB_32 +static inline int32_t pmix_atomic_sub_ptr( volatile void* addr, + void* delta ) +{ + return pmix_atomic_sub_32((int32_t*) addr, (unsigned long) delta); +} +#elif SIZEOF_VOID_P == 8 && PMIX_HAVE_ATOMIC_SUB_64 +static inline int64_t pmix_atomic_sub_ptr( volatile void* addr, + void* delta ) +{ + return pmix_atomic_sub_64((int64_t*) addr, (unsigned long) delta); +} +#else +static inline int32_t pmix_atomic_sub_ptr( volatile void* addr, + void* delta ) +{ + abort(); + return 0; +}
+#endif + +#endif /* PMIX_HAVE_ATOMIC_MATH_32 || PMIX_HAVE_ATOMIC_MATH_64 */ + +/********************************************************************** + * + * Atomic spinlocks + * + *********************************************************************/ +#ifdef PMIX_NEED_INLINE_ATOMIC_SPINLOCKS + +/* + * Lock initialization function. It sets the lock to the supplied initial value. + */ +static inline void +pmix_atomic_init( pmix_atomic_lock_t* lock, int32_t value ) +{ + lock->u.lock = value; +} + + +static inline int +pmix_atomic_trylock(pmix_atomic_lock_t *lock) +{ + int ret = pmix_atomic_cmpset_acq_32( &(lock->u.lock), + PMIX_ATOMIC_UNLOCKED, PMIX_ATOMIC_LOCKED); + return (ret == 0) ? 1 : 0; +} + + +static inline void +pmix_atomic_lock(pmix_atomic_lock_t *lock) +{ + while( !pmix_atomic_cmpset_acq_32( &(lock->u.lock), + PMIX_ATOMIC_UNLOCKED, PMIX_ATOMIC_LOCKED) ) { + while (lock->u.lock == PMIX_ATOMIC_LOCKED) { + /* spin */ ; + } + } +} + + +static inline void +pmix_atomic_unlock(pmix_atomic_lock_t *lock) +{ + pmix_atomic_wmb(); + lock->u.lock=PMIX_ATOMIC_UNLOCKED; +} + +#endif /* PMIX_NEED_INLINE_ATOMIC_SPINLOCKS */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/cma.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/cma.h new file mode 100644 index 00000000000..df5bdb79d37 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/cma.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2011-2012 IBM Corporation. All rights reserved. + * Copyright (c) 2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + */ + +/** @file + * + * Cross Memory Attach syscall definitions.
+ * + * These are only needed temporarily until these new syscalls + * are incorporated into glibc + */ + +#ifndef PMIX_SYS_CMA_H +#define PMIX_SYS_CMA_H 1 + +#if !defined(PMIX_ASSEMBLY_ARCH) +/* need pmix_config.h for the assembly architecture */ +#include "pmix_config.h" +#endif + +#include "src/atomics/sys/architecture.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif + +#ifdef HAVE_UNISTD_H +#include +#endif + +#ifdef __linux__ + +/* Cross Memory Attach is so far only supported under linux */ + +#if PMIX_ASSEMBLY_ARCH == PMIX_X86_64 +#define __NR_process_vm_readv 310 +#define __NR_process_vm_writev 311 +#elif PMIX_ASSEMBLY_ARCH == PMIX_IA32 +#define __NR_process_vm_readv 347 +#define __NR_process_vm_writev 348 +#elif PMIX_ASSEMBLY_ARCH == PMIX_IA64 +#define __NR_process_vm_readv 1332 +#define __NR_process_vm_writev 1333 +#elif PMIX_ASSEMBLY_ARCH == PMIX_POWERPC32 +#define __NR_process_vm_readv 351 +#define __NR_process_vm_writev 352 +#elif PMIX_ASSEMBLY_ARCH == PMIX_POWERPC64 +#define __NR_process_vm_readv 351 +#define __NR_process_vm_writev 352 +#elif PMIX_ASSEMBLY_ARCH == PMIX_ARM + +#define __NR_process_vm_readv 376 +#define __NR_process_vm_writev 377 + +#elif PMIX_ASSEMBLY_ARCH == PMIX_ARM64 + +/* ARM64 uses the asm-generic syscall numbers */ + +#define __NR_process_vm_readv 270 +#define __NR_process_vm_writev 271 + +#elif PMIX_ASSEMBLY_ARCH == PMIX_MIPS + +#if _MIPS_SIM == _MIPS_SIM_ABI64 + +#define __NR_process_vm_readv 5304 +#define __NR_process_vm_writev 5305 + +#elif _MIPS_SIM == _MIPS_SIM_NABI32 + +#define __NR_process_vm_readv 6309 +#define __NR_process_vm_writev 6310 + +#else + +#error "Unsupported MIPS architecture for process_vm_readv and process_vm_writev syscalls" + +#endif + +#elif PMIX_ASSEMBLY_ARCH == PMIX_S390 + +#define __NR_process_vm_readv 340 +#define __NR_process_vm_writev 341 + +#elif PMIX_ASSEMBLY_ARCH == PMIX_S390X + +#define __NR_process_vm_readv 340 +#define __NR_process_vm_writev 341 + +#else +#error "Unsupported architecture for 
process_vm_readv and process_vm_writev syscalls" +#endif + + +static inline ssize_t +process_vm_readv(pid_t pid, + const struct iovec *lvec, + unsigned long liovcnt, + const struct iovec *rvec, + unsigned long riovcnt, + unsigned long flags) +{ + return syscall(__NR_process_vm_readv, pid, lvec, liovcnt, rvec, riovcnt, flags); +} + +static inline ssize_t +process_vm_writev(pid_t pid, + const struct iovec *lvec, + unsigned long liovcnt, + const struct iovec *rvec, + unsigned long riovcnt, + unsigned long flags) +{ + return syscall(__NR_process_vm_writev, pid, lvec, liovcnt, rvec, riovcnt, flags); +} + +#endif /* __linux__ */ + +#endif /* PMIX_SYS_CMA_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/gcc_builtin/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/gcc_builtin/Makefile.include new file mode 100644 index 00000000000..a1476e748f2 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/gcc_builtin/Makefile.include @@ -0,0 +1,26 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2011 Sandia National Laboratories. All rights reserved. +# Copyright (c) 2016 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This makefile.am does not stand on its own - it is included from pmix/include/Makefile.am + +headers += \ + atomics/sys/gcc_builtin/atomic.h diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/gcc_builtin/atomic.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/gcc_builtin/atomic.h new file mode 100644 index 00000000000..b4d25366000 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/gcc_builtin/atomic.h @@ -0,0 +1,229 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_ATOMIC_H +#define PMIX_SYS_ARCH_ATOMIC_H 1 + +#include <stdbool.h> + +/********************************************************************** + * + * Memory Barriers + * + *********************************************************************/ +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 1 + +#define PMIX_HAVE_ATOMIC_MATH_32 1 +#define PMIX_HAVE_ATOMIC_CMPSET_32 1 +#define PMIX_HAVE_ATOMIC_ADD_32 1 +#define PMIX_HAVE_ATOMIC_SUB_32 1 +#define PMIX_HAVE_ATOMIC_SWAP_32 1 +#define PMIX_HAVE_ATOMIC_MATH_64 1 +#define PMIX_HAVE_ATOMIC_CMPSET_64 1 +#define PMIX_HAVE_ATOMIC_ADD_64 1 +#define PMIX_HAVE_ATOMIC_SUB_64 1 +#define PMIX_HAVE_ATOMIC_SWAP_64 1 + + +static inline void pmix_atomic_mb(void) +{ + __atomic_thread_fence (__ATOMIC_SEQ_CST); +} + +static inline void pmix_atomic_rmb(void) +{ + __atomic_thread_fence (__ATOMIC_ACQUIRE); +} + +static inline void pmix_atomic_wmb(void) +{ + __atomic_thread_fence (__ATOMIC_RELEASE); +} + +#define PMIXMB() pmix_atomic_mb() + +/********************************************************************** + * + * Atomic math operations + * + *********************************************************************/ + +/* + * Suppress numerous (spurious ?)
warnings from Oracle Studio compilers + * see https://community.oracle.com/thread/3968347 + */ +#if defined(__SUNPRO_C) || defined(__SUNPRO_CC) +#pragma error_messages(off, E_ARG_INCOMPATIBLE_WITH_ARG_L) +#endif + +static inline int pmix_atomic_cmpset_acq_32( volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + return __atomic_compare_exchange_n (addr, &oldval, newval, false, + __ATOMIC_ACQUIRE, __ATOMIC_RELAXED); +} + + +static inline int pmix_atomic_cmpset_rel_32( volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + return __atomic_compare_exchange_n (addr, &oldval, newval, false, + __ATOMIC_RELEASE, __ATOMIC_RELAXED); +} + +static inline int pmix_atomic_cmpset_32( volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + return __atomic_compare_exchange_n (addr, &oldval, newval, false, + __ATOMIC_ACQUIRE, __ATOMIC_RELAXED); +} + +static inline int32_t pmix_atomic_swap_32 (volatile int32_t *addr, int32_t newval) +{ + int32_t oldval; + __atomic_exchange (addr, &newval, &oldval, __ATOMIC_RELAXED); + return oldval; +} + +static inline int32_t pmix_atomic_add_32(volatile int32_t *addr, int32_t delta) +{ + return __atomic_add_fetch (addr, delta, __ATOMIC_RELAXED); +} + +static inline int32_t pmix_atomic_sub_32(volatile int32_t *addr, int32_t delta) +{ + return __atomic_sub_fetch (addr, delta, __ATOMIC_RELAXED); +} + +static inline int pmix_atomic_cmpset_acq_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + return __atomic_compare_exchange_n (addr, &oldval, newval, false, + __ATOMIC_ACQUIRE, __ATOMIC_RELAXED); +} + +static inline int pmix_atomic_cmpset_rel_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + return __atomic_compare_exchange_n (addr, &oldval, newval, false, + __ATOMIC_RELEASE, __ATOMIC_RELAXED); +} + + +static inline int pmix_atomic_cmpset_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + return __atomic_compare_exchange_n (addr, &oldval, newval, false, + __ATOMIC_ACQUIRE, 
__ATOMIC_RELAXED); +} + +static inline int64_t pmix_atomic_swap_64 (volatile int64_t *addr, int64_t newval) +{ + int64_t oldval; + __atomic_exchange (addr, &newval, &oldval, __ATOMIC_RELAXED); + return oldval; +} + +static inline int64_t pmix_atomic_add_64(volatile int64_t *addr, int64_t delta) +{ + return __atomic_add_fetch (addr, delta, __ATOMIC_RELAXED); +} + +static inline int64_t pmix_atomic_sub_64(volatile int64_t *addr, int64_t delta) +{ + return __atomic_sub_fetch (addr, delta, __ATOMIC_RELAXED); +} + +#if PMIX_HAVE_GCC_BUILTIN_CSWAP_INT128 + +#define PMIX_HAVE_ATOMIC_CMPSET_128 1 + +static inline int pmix_atomic_cmpset_128 (volatile pmix_int128_t *addr, + pmix_int128_t oldval, pmix_int128_t newval) +{ + return __atomic_compare_exchange_n (addr, &oldval, newval, false, + __ATOMIC_ACQUIRE, __ATOMIC_RELAXED); +} + +#elif defined(PMIX_HAVE_SYNC_BUILTIN_CSWAP_INT128) && PMIX_HAVE_SYNC_BUILTIN_CSWAP_INT128 + +#define PMIX_HAVE_ATOMIC_CMPSET_128 1 + +/* __atomic version is not lock-free so use legacy __sync version */ + +static inline int pmix_atomic_cmpset_128 (volatile pmix_int128_t *addr, + pmix_int128_t oldval, pmix_int128_t newval) +{ + return __sync_bool_compare_and_swap (addr, oldval, newval); +} + +#endif + +#if defined(__HLE__) + +#include <immintrin.h> + +#define PMIX_HAVE_ATOMIC_SPINLOCKS 1 + +static inline void pmix_atomic_init (pmix_atomic_lock_t* lock, int32_t value) +{ + lock->u.lock = value; +} + +static inline int pmix_atomic_trylock(pmix_atomic_lock_t *lock) +{ + int ret = __atomic_exchange_n (&lock->u.lock, PMIX_ATOMIC_LOCKED, + __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE); + if (PMIX_ATOMIC_LOCKED == ret) { + /* abort the transaction */ + _mm_pause (); + return 1; + } + + return 0; +} + +static inline void pmix_atomic_lock (pmix_atomic_lock_t *lock) +{ + while (PMIX_ATOMIC_LOCKED == __atomic_exchange_n (&lock->u.lock, PMIX_ATOMIC_LOCKED, + __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE)) { + /* abort the transaction */ + _mm_pause (); + } +} + +static inline void
pmix_atomic_unlock (pmix_atomic_lock_t *lock) +{ + __atomic_store_n (&lock->u.lock, PMIX_ATOMIC_UNLOCKED, + __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE); +} + +#endif + +#if defined(__SUNPRO_C) || defined(__SUNPRO_CC) +#pragma error_messages(default, E_ARG_INCOMPATIBLE_WITH_ARG_L) +#endif + +#endif /* ! PMIX_SYS_ARCH_ATOMIC_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia32/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia32/Makefile.include new file mode 100644 index 00000000000..799a43d7e9d --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia32/Makefile.include @@ -0,0 +1,24 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This makefile.am does not stand on its own - it is included from pmix/include/Makefile.am + +headers += \ + atomics/sys/ia32/atomic.h \ + atomics/sys/ia32/timer.h diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia32/atomic.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia32/atomic.h new file mode 100644 index 00000000000..85693ad996b --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia32/atomic.h @@ -0,0 +1,223 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
+ * Copyright (c) 2004-2010 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_ATOMIC_H +#define PMIX_SYS_ARCH_ATOMIC_H 1 + +/* + * On ia32, we use cmpxchg. + */ + +#define PMIXSMPLOCK "lock; " +#define PMIXMB() __asm__ __volatile__("": : :"memory") + + +/********************************************************************** + * + * Define constants for IA32 + * + *********************************************************************/ +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 1 + +#define PMIX_HAVE_ATOMIC_CMPSET_32 1 + +#define PMIX_HAVE_ATOMIC_MATH_32 1 +#define PMIX_HAVE_ATOMIC_ADD_32 1 +#define PMIX_HAVE_ATOMIC_SUB_32 1 + +#define PMIX_HAVE_ATOMIC_CMPSET_64 1 + +#undef PMIX_HAVE_INLINE_ATOMIC_CMPSET_64 +#define PMIX_HAVE_INLINE_ATOMIC_CMPSET_64 0 + +/********************************************************************** + * + * Memory Barriers + * + *********************************************************************/ +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline void pmix_atomic_mb(void) +{ + PMIXMB(); +} + + +static inline void pmix_atomic_rmb(void) +{ + PMIXMB(); +} + + +static inline void pmix_atomic_wmb(void) +{ + PMIXMB(); +} + +static inline void pmix_atomic_isync(void) +{ +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + + 
+/********************************************************************** + * + * Atomic math operations + * + *********************************************************************/ +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline int pmix_atomic_cmpset_32(volatile int32_t *addr, + int32_t oldval, + int32_t newval) +{ + unsigned char ret; + __asm__ __volatile__ ( + PMIXSMPLOCK "cmpxchgl %3,%2 \n\t" + "sete %0 \n\t" + : "=qm" (ret), "+a" (oldval), "+m" (*addr) + : "q"(newval) + : "memory", "cc"); + + return (int)ret; +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#define pmix_atomic_cmpset_acq_32 pmix_atomic_cmpset_32 +#define pmix_atomic_cmpset_rel_32 pmix_atomic_cmpset_32 + +#if PMIX_GCC_INLINE_ASSEMBLY + +#if 0 + +/* some versions of GCC won't let you use ebx period (even though they + should be able to save / restore for the life of the inline + assembly). For the beta, just use the non-inline version */ + +#ifndef ll_low /* GLIBC provides these somewhere, so protect */ +#define ll_low(x) *(((unsigned int*)&(x))+0) +#define ll_high(x) *(((unsigned int*)&(x))+1) +#endif + +/* On Linux the EBX register is used by the shared libraries + * to keep the global offset. In same time this register is + * required by the cmpxchg8b instruction (as an input parameter). + * This conflict force us to save the EBX before the cmpxchg8b + * and to restore it afterward. + */ +static inline int pmix_atomic_cmpset_64(volatile int64_t *addr, + int64_t oldval, + int64_t newval) +{ + /* + * Compare EDX:EAX with m64. If equal, set ZF and load ECX:EBX into + * m64. Else, clear ZF and load m64 into EDX:EAX. 
+ */ + unsigned char ret; + + __asm__ __volatile__( + "push %%ebx \n\t" + "movl %4, %%ebx \n\t" + SMPLOCK "cmpxchg8b (%1) \n\t" + "sete %0 \n\t" + "pop %%ebx \n\t" + : "=qm"(ret) + : "D"(addr), "a"(ll_low(oldval)), "d"(ll_high(oldval)), + "r"(ll_low(newval)), "c"(ll_high(newval)) + : "cc", "memory", "ebx"); + return (int) ret; +} +#endif /* if 0 */ + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#define pmix_atomic_cmpset_acq_64 pmix_atomic_cmpset_64 +#define pmix_atomic_cmpset_rel_64 pmix_atomic_cmpset_64 + +#if PMIX_GCC_INLINE_ASSEMBLY + +#define PMIX_HAVE_ATOMIC_SWAP_32 1 + +static inline int32_t pmix_atomic_swap_32( volatile int32_t *addr, + int32_t newval) +{ + int32_t oldval; + + __asm__ __volatile__("xchg %1, %0" : + "=r" (oldval), "=m" (*addr) : + "0" (newval), "m" (*addr) : + "memory"); + return oldval; +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + + +#if PMIX_GCC_INLINE_ASSEMBLY + +/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type int + * + * Atomically adds @i to @v. + */ +static inline int32_t pmix_atomic_add_32(volatile int32_t* v, int i) +{ + int ret = i; + __asm__ __volatile__( + PMIXSMPLOCK "xaddl %1,%0" + :"+m" (*v), "+r" (ret) + : + :"memory", "cc" + ); + return (ret+i); +} + + +/** + * atomic_sub - subtract the atomic variable + * @i: integer value to subtract + * @v: pointer of type int + * + * Atomically subtracts @i from @v. + */ +static inline int32_t pmix_atomic_sub_32(volatile int32_t* v, int i) +{ + int ret = -i; + __asm__ __volatile__( + PMIXSMPLOCK "xaddl %1,%0" + :"+m" (*v), "+r" (ret) + : + :"memory", "cc" + ); + return (ret-i); +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#endif /* ! 
PMIX_SYS_ARCH_ATOMIC_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia32/timer.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia32/timer.h new file mode 100644 index 00000000000..5be92d4902d --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia32/timer.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_TIMER_H +#define PMIX_SYS_ARCH_TIMER_H 1 + + +typedef uint64_t pmix_timer_t; + +/* Using RDTSC(P) results in non-monotonic timers across cores */ +#undef PMIX_TIMER_MONOTONIC +#define PMIX_TIMER_MONOTONIC 0 + +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline pmix_timer_t +pmix_sys_timer_get_cycles(void) +{ + pmix_timer_t ret; + int tmp; + + __asm__ __volatile__( + "xchgl %%ebx, %1\n" + "cpuid\n" + "xchgl %%ebx, %1\n" + "rdtsc\n" + : "=A"(ret), "=r"(tmp) + :: "ecx"); + + return ret; +} + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 1 + +#else + +pmix_timer_t pmix_sys_timer_get_cycles(void); + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 1 + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#endif /* ! 
PMIX_SYS_ARCH_TIMER_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia64/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia64/Makefile.include new file mode 100644 index 00000000000..d1f4e5e4b62 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia64/Makefile.include @@ -0,0 +1,24 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This makefile.am does not stand on its own - it is included from pmix/include/Makefile.am + +headers += \ + atomics/sys/ia64/atomic.h \ + atomics/sys/ia64/timer.h diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia64/atomic.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia64/atomic.h new file mode 100644 index 00000000000..ca8ce8dfdde --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia64/atomic.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_ATOMIC_H +#define PMIX_SYS_ARCH_ATOMIC_H 1 + +/* + * On ia64, we use cmpxchg, which supports acquire/release semantics natively. + */ + + +#define PMIXMB() __asm__ __volatile__("mf": : :"memory") + + +/********************************************************************** + * + * Define constants for IA64 + * + *********************************************************************/ +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 1 + +#define PMIX_HAVE_ATOMIC_CMPSET_32 1 +#define PMIX_HAVE_ATOMIC_CMPSET_64 1 + +/********************************************************************** + * + * Memory Barriers + * + *********************************************************************/ +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline void pmix_atomic_mb(void) +{ + PMIXMB(); +} + + +static inline void pmix_atomic_rmb(void) +{ + PMIXMB(); +} + + +static inline void pmix_atomic_wmb(void) +{ + PMIXMB(); +} + +static inline void pmix_atomic_isync(void) +{ +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + + +/********************************************************************** + * + * Atomic math operations + * + *********************************************************************/ +#if PMIX_GCC_INLINE_ASSEMBLY + +#define ia64_cmpxchg4_acq(ptr, new, old) \ +({ \ + __u64 ia64_intri_res; \ + ia64_intri_res; \ +}) + +static inline int pmix_atomic_cmpset_acq_32( volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + int64_t ret; + + __asm__ __volatile__ ("mov ar.ccv=%0;;" :: "rO"(oldval)); + __asm__ __volatile__ ("cmpxchg4.acq %0=[%1],%2,ar.ccv": + "=r"(ret) : "r"(addr), "r"(newval) : "memory"); + + return ((int32_t)ret == oldval); +} + + +static inline int pmix_atomic_cmpset_rel_32( volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + int64_t ret; + + __asm__ __volatile__ ("mov ar.ccv=%0;;" :: "rO"(oldval)); + __asm__ __volatile__ ("cmpxchg4.rel %0=[%1],%2,ar.ccv": + 
"=r"(ret) : "r"(addr), "r"(newval) : "memory"); + + return ((int32_t)ret == oldval); +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + + +#define pmix_atomic_cmpset_32 pmix_atomic_cmpset_acq_32 + +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline int pmix_atomic_cmpset_acq_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int64_t ret; + + __asm__ __volatile__ ("mov ar.ccv=%0;;" :: "rO"(oldval)); + __asm__ __volatile__ ("cmpxchg8.acq %0=[%1],%2,ar.ccv": + "=r"(ret) : "r"(addr), "r"(newval) : "memory"); + + return (ret == oldval); +} + + +static inline int pmix_atomic_cmpset_rel_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int64_t ret; + + __asm__ __volatile__ ("mov ar.ccv=%0;;" :: "rO"(oldval)); + __asm__ __volatile__ ("cmpxchg8.rel %0=[%1],%2,ar.ccv": + "=r"(ret) : "r"(addr), "r"(newval) : "memory"); + + return (ret == oldval); +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#define pmix_atomic_cmpset_64 pmix_atomic_cmpset_acq_64 + +#endif /* ! PMIX_SYS_ARCH_ATOMIC_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia64/timer.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia64/timer.h new file mode 100644 index 00000000000..5a33236592d --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/ia64/timer.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_TIMER_H +#define PMIX_SYS_ARCH_TIMER_H 1 + + +typedef uint64_t pmix_timer_t; + + +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline pmix_timer_t +pmix_sys_timer_get_cycles(void) +{ + pmix_timer_t ret; + + __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret)); + + return ret; +} + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 1 + +#else + +pmix_timer_t pmix_sys_timer_get_cycles(void); + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 1 + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#endif /* ! PMIX_SYS_ARCH_TIMER_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/mips/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/mips/Makefile.include new file mode 100644 index 00000000000..f3916e581da --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/mips/Makefile.include @@ -0,0 +1,24 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2008 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This makefile.am does not stand on its own - it is included from pmix/include/Makefile.am + +headers += \ + atomics/sys/mips/atomic.h \ + atomics/sys/mips/timer.h diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/mips/atomic.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/mips/atomic.h new file mode 100644 index 00000000000..2e0765d9e2f --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/mips/atomic.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_ATOMIC_H +#define PMIX_SYS_ARCH_ATOMIC_H 1 + + +/* BWB - FIX ME! 
*/ +#ifdef __linux__ +#define PMIXMB() __asm__ __volatile__(".set mips2; sync; .set mips0": : :"memory") +#define PMIXRMB() __asm__ __volatile__(".set mips2; sync; .set mips0": : :"memory") +#define PMIXWMB() __asm__ __volatile__(".set mips2; sync; .set mips0": : :"memory") +#define PMIXSMP_SYNC ".set mips2; sync; .set mips0" +#else +#define PMIXMB() __asm__ __volatile__("sync": : :"memory") +#define PMIXRMB() __asm__ __volatile__("sync": : :"memory") +#define PMIXWMB() __asm__ __volatile__("sync": : :"memory") +#define PMIXSMP_SYNC "sync" +#endif + + +/********************************************************************** + * + * Define constants for MIPS + * + *********************************************************************/ +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 1 + +#define PMIX_HAVE_ATOMIC_CMPSET_32 1 + +#ifdef __mips64 +#define PMIX_HAVE_ATOMIC_CMPSET_64 1 +#endif + +/********************************************************************** + * + * Memory Barriers + * + *********************************************************************/ +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline +void pmix_atomic_mb(void) +{ + PMIXMB(); +} + + +static inline +void pmix_atomic_rmb(void) +{ + PMIXRMB(); +} + + +static inline +void pmix_atomic_wmb(void) +{ + PMIXWMB(); +} + +static inline +void pmix_atomic_isync(void) +{ +} + +#endif + +/********************************************************************** + * + * Atomic math operations + * + *********************************************************************/ +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline int pmix_atomic_cmpset_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + int32_t ret; + + __asm__ __volatile__ (".set noreorder \n" + ".set noat \n" + "1: \n" +#ifdef __linux__ + ".set mips2 \n\t" +#endif + "ll %0, %2 \n" /* load *addr into ret */ + "bne %0, %z3, 2f \n" /* done if oldval != ret */ + "or $1, %z4, 0 \n" /* tmp = newval (delay slot) */ + "sc $1, %2 \n" /* store tmp in *addr */ +#ifdef 
__linux__ + ".set mips0 \n\t" +#endif + /* note: ret will be 0 if failed, 1 if succeeded */ + "beqz $1, 1b \n" /* if 0 jump back to 1b */ + "nop \n" /* fill delay slots */ + "2: \n" + ".set reorder \n" + : "=&r"(ret), "=m"(*addr) + : "m"(*addr), "r"(oldval), "r"(newval) + : "cc", "memory"); + return (ret == oldval); +} + + +/* these two functions aren't inlined in the non-gcc case because then + there would be two function calls (since neither cmpset_32 nor + atomic_?mb can be inlined). Instead, we "inline" them by hand in + the assembly, meaning there is one function call overhead instead + of two */ +static inline int pmix_atomic_cmpset_acq_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + int rc; + + rc = pmix_atomic_cmpset_32(addr, oldval, newval); + pmix_atomic_rmb(); + + return rc; +} + + +static inline int pmix_atomic_cmpset_rel_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + pmix_atomic_wmb(); + return pmix_atomic_cmpset_32(addr, oldval, newval); +} + +#ifdef PMIX_HAVE_ATOMIC_CMPSET_64 +static inline int pmix_atomic_cmpset_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int64_t ret; + + __asm__ __volatile__ (".set noreorder \n" + ".set noat \n" + "1: \n\t" + "lld %0, %2 \n\t" /* load *addr into ret */ + "bne %0, %z3, 2f \n\t" /* done if oldval != ret */ + "or $1, %4, 0 \n\t" /* tmp = newval (delay slot) */ + "scd $1, %2 \n\t" /* store tmp in *addr */ + /* note: ret will be 0 if failed, 1 if succeeded */ + "beqz $1, 1b \n\t" /* if 0 jump back to 1b */ + "nop \n\t" /* fill delay slot */ + "2: \n\t" + ".set reorder \n" + : "=&r" (ret), "=m" (*addr) + : "m" (*addr), "r" (oldval), "r" (newval) + : "cc", "memory"); + + return (ret == oldval); +} + + +/* these two functions aren't inlined in the non-gcc case because then + there would be two function calls (since neither cmpset_64 nor + atomic_?mb can be inlined). 
Instead, we "inline" them by hand in + the assembly, meaning there is one function call overhead instead + of two */ +static inline int pmix_atomic_cmpset_acq_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int rc; + + rc = pmix_atomic_cmpset_64(addr, oldval, newval); + pmix_atomic_rmb(); + + return rc; +} + + +static inline int pmix_atomic_cmpset_rel_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + pmix_atomic_wmb(); + return pmix_atomic_cmpset_64(addr, oldval, newval); +} +#endif /* PMIX_HAVE_ATOMIC_CMPSET_64 */ + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#endif /* ! PMIX_SYS_ARCH_ATOMIC_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/mips/timer.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/mips/timer.h new file mode 100644 index 00000000000..65532ac8a77 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/mips/timer.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_TIMER_H +#define PMIX_SYS_ARCH_TIMER_H 1 + +#include + +typedef uint64_t pmix_timer_t; + +static inline pmix_timer_t +pmix_sys_timer_get_cycles(void) +{ + pmix_timer_t ret; + struct tms accurate_clock; + + times(&accurate_clock); + ret = accurate_clock.tms_utime + accurate_clock.tms_stime; + + return ret; +} + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 1 + +#endif /* ! 
PMIX_SYS_ARCH_TIMER_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/powerpc/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/powerpc/Makefile.include new file mode 100644 index 00000000000..fee4119debe --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/powerpc/Makefile.include @@ -0,0 +1,24 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This makefile.am does not stand on its own - it is included from pmix/include/Makefile.am + +headers += \ + atomics/sys/powerpc/atomic.h \ + atomics/sys/powerpc/timer.h diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/powerpc/atomic.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/powerpc/atomic.h new file mode 100644 index 00000000000..98fbccbbfc3 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/powerpc/atomic.h @@ -0,0 +1,464 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_ATOMIC_H +#define PMIX_SYS_ARCH_ATOMIC_H 1 + +/* + * On powerpc ... + */ + +#define PMIXMB() __asm__ __volatile__ ("sync" : : : "memory") +#define PMIXRMB() __asm__ __volatile__ ("lwsync" : : : "memory") +#define PMIXWMB() __asm__ __volatile__ ("eieio" : : : "memory") +#define PMIXISYNC() __asm__ __volatile__ ("isync" : : : "memory") +#define PMIXSMP_SYNC "sync \n\t" +#define PMIXSMP_ISYNC "\n\tisync" + + +/********************************************************************** + * + * Define constants for PowerPC 32 + * + *********************************************************************/ +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 1 + +#define PMIX_HAVE_ATOMIC_CMPSET_32 1 +#define PMIX_HAVE_ATOMIC_SWAP_32 1 +#define PMIX_HAVE_ATOMIC_LLSC_32 1 + +#define PMIX_HAVE_ATOMIC_MATH_32 1 +#define PMIX_HAVE_ATOMIC_ADD_32 1 +#define PMIX_HAVE_ATOMIC_SUB_32 1 + + +#if (PMIX_ASSEMBLY_ARCH == PMIX_POWERPC64) || PMIX_ASM_SUPPORT_64BIT +#define PMIX_HAVE_ATOMIC_CMPSET_64 1 +#define PMIX_HAVE_ATOMIC_SWAP_64 1 +#define PMIX_HAVE_ATOMIC_LLSC_64 1 +#define PMIX_HAVE_ATOMIC_MATH_64 1 +#define PMIX_HAVE_ATOMIC_ADD_64 1 +#define PMIX_HAVE_ATOMIC_SUB_64 1 +#endif + + +/********************************************************************** + * + * Memory Barriers + * + *********************************************************************/ +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline +void pmix_atomic_mb(void) +{ + PMIXMB(); +} + + +static inline +void pmix_atomic_rmb(void) +{ + PMIXRMB(); +} + + +static inline +void pmix_atomic_wmb(void) +{ + PMIXRMB(); +} + +static inline +void pmix_atomic_isync(void) +{ + PMIXISYNC(); +} + +#elif PMIX_XLC_INLINE_ASSEMBLY /* end 
PMIX_GCC_INLINE_ASSEMBLY */ + +/* Yeah, I don't know who thought this was a reasonable syntax for + * inline assembly. Do these because they are used so often and they + * are fairly simple (aka: there is a tech pub on IBM's web site + * containing the right hex for the instructions). + */ + +#undef PMIX_HAVE_INLINE_ATOMIC_MEM_BARRIER +#define PMIX_HAVE_INLINE_ATOMIC_MEM_BARRIER 0 + +#pragma mc_func pmix_atomic_mb { "7c0004ac" } /* sync */ +#pragma reg_killed_by pmix_atomic_mb /* none */ + +#pragma mc_func pmix_atomic_rmb { "7c2004ac" } /* lwsync */ +#pragma reg_killed_by pmix_atomic_rmb /* none */ + +#pragma mc_func pmix_atomic_wmb { "7c0006ac" } /* eieio */ +#pragma reg_killed_by pmix_atomic_wmb /* none */ + +#endif + +/********************************************************************** + * + * Atomic math operations + * + *********************************************************************/ +#if PMIX_GCC_INLINE_ASSEMBLY + +#ifdef __xlC__ +/* work-around bizzare xlc bug in which it sign-extends + a pointer to a 32-bit signed integer */ +#define PMIX_ASM_ADDR(a) ((uintptr_t)a) +#else +#define PMIX_ASM_ADDR(a) (a) +#endif + +#if defined(__PGI) +/* work-around for bug in PGI 16.5-16.7 where the compiler fails to + * correctly emit load instructions for 64-bit operands. without this + * it will emit lwz instead of ld to load the 64-bit operand. */ +#define PMIX_ASM_VALUE64(x) (void *)(intptr_t) (x) +#else +#define PMIX_ASM_VALUE64(x) x +#endif + + +static inline int pmix_atomic_cmpset_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + int32_t ret; + + __asm__ __volatile__ ( + "1: lwarx %0, 0, %2 \n\t" + " cmpw 0, %0, %3 \n\t" + " bne- 2f \n\t" + " stwcx. 
%4, 0, %2 \n\t" + " bne- 1b \n\t" + "2:" + : "=&r" (ret), "=m" (*addr) + : "r" PMIX_ASM_ADDR(addr), "r" (oldval), "r" (newval), "m" (*addr) + : "cc", "memory"); + + return (ret == oldval); +} + +static inline int32_t pmix_atomic_ll_32 (volatile int32_t *addr) +{ + int32_t ret; + + __asm__ __volatile__ ("lwarx %0, 0, %1 \n\t" + : "=&r" (ret) + : "r" (addr) + ); + return ret; +} + +static inline int pmix_atomic_sc_32 (volatile int32_t *addr, int32_t newval) +{ + int32_t ret, foo; + + __asm__ __volatile__ (" stwcx. %4, 0, %3 \n\t" + " li %0,0 \n\t" + " bne- 1f \n\t" + " ori %0,%0,1 \n\t" + "1:" + : "=r" (ret), "=m" (*addr), "=r" (foo) + : "r" (addr), "r" (newval) + : "cc", "memory"); + return ret; +} + +/* these two functions aren't inlined in the non-gcc case because then + there would be two function calls (since neither cmpset_32 nor + atomic_?mb can be inlined). Instead, we "inline" them by hand in + the assembly, meaning there is one function call overhead instead + of two */ +static inline int pmix_atomic_cmpset_acq_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + int rc; + + rc = pmix_atomic_cmpset_32(addr, oldval, newval); + pmix_atomic_rmb(); + + return rc; +} + + +static inline int pmix_atomic_cmpset_rel_32(volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + pmix_atomic_wmb(); + return pmix_atomic_cmpset_32(addr, oldval, newval); +} + +static inline int32_t pmix_atomic_swap_32(volatile int32_t *addr, int32_t newval) +{ + int32_t ret; + + __asm__ __volatile__ ("1: lwarx %0, 0, %2 \n\t" + " stwcx. %3, 0, %2 \n\t" + " bne- 1b \n\t" + : "=&r" (ret), "=m" (*addr) + : "r" (addr), "r" (newval) + : "cc", "memory"); + + return ret; +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + + +#if (PMIX_ASSEMBLY_ARCH == PMIX_POWERPC64) + +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline int64_t pmix_atomic_add_64 (volatile int64_t* v, int64_t inc) +{ + int64_t t; + + __asm__ __volatile__("1: ldarx %0, 0, %3 \n\t" + " add %0, %2, %0 \n\t" + " stdcx. 
%0, 0, %3 \n\t" + " bne- 1b \n\t" + : "=&r" (t), "=m" (*v) + : "r" (PMIX_ASM_VALUE64(inc)), "r" PMIX_ASM_ADDR(v), "m" (*v) + : "cc"); + + return t; +} + + +static inline int64_t pmix_atomic_sub_64 (volatile int64_t* v, int64_t dec) +{ + int64_t t; + + __asm__ __volatile__( + "1: ldarx %0,0,%3 \n\t" + " subf %0,%2,%0 \n\t" + " stdcx. %0,0,%3 \n\t" + " bne- 1b \n\t" + : "=&r" (t), "=m" (*v) + : "r" (PMIX_ASM_VALUE64(dec)), "r" PMIX_ASM_ADDR(v), "m" (*v) + : "cc"); + + return t; +} + +static inline int pmix_atomic_cmpset_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int64_t ret; + + __asm__ __volatile__ ( + "1: ldarx %0, 0, %2 \n\t" + " cmpd 0, %0, %3 \n\t" + " bne- 2f \n\t" + " stdcx. %4, 0, %2 \n\t" + " bne- 1b \n\t" + "2:" + : "=&r" (ret), "=m" (*addr) + : "r" (addr), "r" (PMIX_ASM_VALUE64(oldval)), "r" (PMIX_ASM_VALUE64(newval)), "m" (*addr) + : "cc", "memory"); + + return (ret == oldval); +} + +static inline int64_t pmix_atomic_ll_64(volatile int64_t *addr) +{ + int64_t ret; + + __asm__ __volatile__ ("ldarx %0, 0, %1 \n\t" + : "=&r" (ret) + : "r" (addr) + ); + return ret; +} + +static inline int pmix_atomic_sc_64(volatile int64_t *addr, int64_t newval) +{ + int32_t ret; + + __asm__ __volatile__ (" stdcx. %2, 0, %1 \n\t" + " li %0,0 \n\t" + " bne- 1f \n\t" + " ori %0,%0,1 \n\t" + "1:" + : "=r" (ret) + : "r" (addr), "r" (PMIX_ASM_VALUE64(newval)) + : "cc", "memory"); + return ret; +} + +/* these two functions aren't inlined in the non-gcc case because then + there would be two function calls (since neither cmpset_64 nor + atomic_?mb can be inlined). 
Instead, we "inline" them by hand in + the assembly, meaning there is one function call overhead instead + of two */ +static inline int pmix_atomic_cmpset_acq_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int rc; + + rc = pmix_atomic_cmpset_64(addr, oldval, newval); + pmix_atomic_rmb(); + + return rc; +} + + +static inline int pmix_atomic_cmpset_rel_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + pmix_atomic_wmb(); + return pmix_atomic_cmpset_64(addr, oldval, newval); +} + +static inline int64_t pmix_atomic_swap_64(volatile int64_t *addr, int64_t newval) +{ + int64_t ret; + + __asm__ __volatile__ ("1: ldarx %0, 0, %2 \n\t" + " stdcx. %3, 0, %2 \n\t" + " bne- 1b \n\t" + : "=&r" (ret), "=m" (*addr) + : "r" (addr), "r" (PMIX_ASM_VALUE64(newval)) + : "cc", "memory"); + + return ret; +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#elif (PMIX_ASSEMBLY_ARCH == PMIX_POWERPC32) && PMIX_ASM_SUPPORT_64BIT + +#ifndef ll_low /* GLIBC provides these somewhere, so protect */ +#define ll_low(x) *(((unsigned int*)&(x))+0) +#define ll_high(x) *(((unsigned int*)&(x))+1) +#endif + +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline int pmix_atomic_cmpset_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int ret; + + /* + * We force oldval and newval into memory because PPC doesn't + * appear to have a way to do a move register with offset. Since + * this is 32-bit code, a 64 bit integer will be loaded into two + * registers (assuming no inlining, addr will be in r3, oldval + * will be in r4 and r5, and newval will be r6 and r7. We need + * to load the whole thing into one register. So we have the + * compiler push the values into memory and load the double word + * into registers. We use r4,r5 so that the main block of code + * is very similar to the pure 64 bit version. + */ + __asm__ __volatile__ ( + "ld r4,%2 \n\t" + "ld r5,%3 \n\t" + "1: ldarx r9, 0, %1 \n\t" + " cmpd 0, r9, r4 \n\t" + " bne- 2f \n\t" + " stdcx. 
r5, 0, %1 \n\t" + " bne- 1b \n\t" + "2: \n\t" + "xor r5,r4,r9 \n\t" + "subfic r9,r5,0 \n\t" + "adde %0,r9,r5 \n\t" + : "=&r" (ret) + : "r"PMIX_ASM_ADDR(addr), + "m"(oldval), "m"(newval) + : "r4", "r5", "r9", "cc", "memory"); + + return ret; +} + +/* these two functions aren't inlined in the non-gcc case because then + there would be two function calls (since neither cmpset_64 nor + atomic_?mb can be inlined). Instead, we "inline" them by hand in + the assembly, meaning there is one function call overhead instead + of two */ +static inline int pmix_atomic_cmpset_acq_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int rc; + + rc = pmix_atomic_cmpset_64(addr, oldval, newval); + pmix_atomic_rmb(); + + return rc; +} + + +static inline int pmix_atomic_cmpset_rel_64(volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + pmix_atomic_wmb(); + return pmix_atomic_cmpset_64(addr, oldval, newval); +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#endif /* PMIX_ASM_SUPPORT_64BIT */ + + +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline int32_t pmix_atomic_add_32(volatile int32_t* v, int inc) +{ + int32_t t; + + __asm__ __volatile__( + "1: lwarx %0, 0, %3 \n\t" + " add %0, %2, %0 \n\t" + " stwcx. %0, 0, %3 \n\t" + " bne- 1b \n\t" + : "=&r" (t), "=m" (*v) + : "r" (inc), "r" PMIX_ASM_ADDR(v), "m" (*v) + : "cc"); + + return t; +} + + +static inline int32_t pmix_atomic_sub_32(volatile int32_t* v, int dec) +{ + int32_t t; + + __asm__ __volatile__( + "1: lwarx %0,0,%3 \n\t" + " subf %0,%2,%0 \n\t" + " stwcx. %0,0,%3 \n\t" + " bne- 1b \n\t" + : "=&r" (t), "=m" (*v) + : "r" (dec), "r" PMIX_ASM_ADDR(v), "m" (*v) + : "cc"); + + return t; +} + + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#endif /* ! 
PMIX_SYS_ARCH_ATOMIC_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/powerpc/timer.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/powerpc/timer.h new file mode 100644 index 00000000000..dd8c3ffe1b6 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/powerpc/timer.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_TIMER_H +#define PMIX_SYS_ARCH_TIMER_H 1 + + +typedef uint64_t pmix_timer_t; + + +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline pmix_timer_t +pmix_sys_timer_get_cycles(void) +{ + unsigned int tbl, tbu0, tbu1; + + do { + __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0)); + __asm__ __volatile__ ("mftb %0" : "=r"(tbl)); + __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1)); + } while (tbu0 != tbu1); + + return (((unsigned long long)tbu0) << 32) | tbl; +} + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 1 + +#else + +pmix_timer_t pmix_sys_timer_get_cycles(void); + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 1 + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#endif /* ! 
PMIX_SYS_ARCH_TIMER_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sparcv9/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sparcv9/Makefile.include new file mode 100644 index 00000000000..f2ad630bf69 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sparcv9/Makefile.include @@ -0,0 +1,24 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This makefile.am does not stand on its own - it is included from pmix/include/Makefile.am + +headers += \ + atomics/sys/sparcv9/atomic.h \ + atomics/sys/sparcv9/timer.h diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sparcv9/atomic.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sparcv9/atomic.h new file mode 100644 index 00000000000..9d41bde0a44 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sparcv9/atomic.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserverd. 
+ * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_ATOMIC_H +#define PMIX_SYS_ARCH_ATOMIC_H 1 + +/* + * On sparc v9, use casa and casxa (compare and swap) instructions. + */ + +#define PMIXASI_P "0x80" + +#define PMIXMEMBAR(type) __asm__ __volatile__ ("membar " type : : : "memory") + + +/********************************************************************** + * + * Define constants for Sparc v9 (Ultra Sparc) + * + *********************************************************************/ +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 1 + +#define PMIX_HAVE_ATOMIC_CMPSET_32 1 + +#define PMIX_HAVE_ATOMIC_CMPSET_64 1 + + +/********************************************************************** + * + * Memory Barriers + * + *********************************************************************/ +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline void pmix_atomic_mb(void) +{ + PMIXMEMBAR("#LoadLoad | #LoadStore | #StoreStore | #StoreLoad"); +} + + +static inline void pmix_atomic_rmb(void) +{ + PMIXMEMBAR("#LoadLoad"); +} + + +static inline void pmix_atomic_wmb(void) +{ + PMIXMEMBAR("#StoreStore"); +} + +static inline void pmix_atomic_isync(void) +{ +} + + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + + +/********************************************************************** + * + * Atomic math operations + * + *********************************************************************/ +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline int pmix_atomic_cmpset_32( volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + /* casa [reg(rs1)] %asi, reg(rs2), reg(rd) + * + * if (*(reg(rs1)) == reg(rs2) ) + * swap reg(rd), *(reg(rs1)) + * else + * reg(rd) = *(reg(rs1)) + */ + + int32_t ret = newval; + + __asm__ __volatile__("casa [%1] " PMIXASI_P ", %2, %0" + : "+r" (ret) + : "r" 
(addr), "r" (oldval)); + return (ret == oldval); +} + + +static inline int pmix_atomic_cmpset_acq_32( volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + int rc; + + rc = pmix_atomic_cmpset_32(addr, oldval, newval); + pmix_atomic_rmb(); + + return rc; +} + + +static inline int pmix_atomic_cmpset_rel_32( volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + pmix_atomic_wmb(); + return pmix_atomic_cmpset_32(addr, oldval, newval); +} + + +#if PMIX_ASSEMBLY_ARCH == PMIX_SPARCV9_64 + +static inline int pmix_atomic_cmpset_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + /* casa [reg(rs1)] %asi, reg(rs2), reg(rd) + * + * if (*(reg(rs1)) == reg(rs1) ) + * swap reg(rd), *(reg(rs1)) + * else + * reg(rd) = *(reg(rs1)) + */ + int64_t ret = newval; + + __asm__ __volatile__("casxa [%1] " PMIXASI_P ", %2, %0" + : "+r" (ret) + : "r" (addr), "r" (oldval)); + return (ret == oldval); +} + +#else /* PMIX_ASSEMBLY_ARCH == PMIX_SPARCV9_64 */ + +static inline int pmix_atomic_cmpset_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + /* casa [reg(rs1)] %asi, reg(rs2), reg(rd) + * + * if (*(reg(rs1)) == reg(rs1) ) + * swap reg(rd), *(reg(rs1)) + * else + * reg(rd) = *(reg(rs1)) + * + */ + long long ret = newval; + + __asm__ __volatile__( + "ldx %0, %%g1 \n\t" /* g1 = ret */ + "ldx %2, %%g2 \n\t" /* g2 = oldval */ + "casxa [%1] " PMIXASI_P ", %%g2, %%g1 \n\t" + "stx %%g1, %0 \n" + : "+m"(ret) + : "r"(addr), "m"(oldval) + : "%g1", "%g2" + ); + + return (ret == oldval); +} + +#endif /* PMIX_ASSEMBLY_ARCH == PMIX_SPARCV9_64 */ + +static inline int pmix_atomic_cmpset_acq_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + int rc; + + rc = pmix_atomic_cmpset_64(addr, oldval, newval); + pmix_atomic_rmb(); + + return rc; +} + + +static inline int pmix_atomic_cmpset_rel_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + pmix_atomic_wmb(); + return pmix_atomic_cmpset_64(addr, oldval, newval); +} + +#endif /* 
PMIX_GCC_INLINE_ASSEMBLY */ + + +#endif /* ! PMIX_SYS_ARCH_ATOMIC_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sparcv9/timer.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sparcv9/timer.h new file mode 100644 index 00000000000..395ea986014 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sparcv9/timer.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_TIMER_H +#define PMIX_SYS_ARCH_TIMER_H 1 + +typedef uint64_t pmix_timer_t; + +#if PMIX_GCC_INLINE_ASSEMBLY + + +#if PMIX_ASSEMBLY_ARCH == PMIX_SPARCV9_64 + +static inline pmix_timer_t +pmix_sys_timer_get_cycles(void) +{ + pmix_timer_t ret; + + __asm__ __volatile__("rd %%tick, %0" : "=r"(ret)); + + return ret; +} + +#else /* PMIX_SPARCV9_32 */ + +static inline pmix_timer_t +pmix_sys_timer_get_cycles(void) +{ + pmix_timer_t ret; + int a, b; + + __asm__ __volatile__("rd %%tick, %0 \n" + "srlx %0, 32, %1 " : + "=r"(a), "=r"(b) + ); + + ret = (0x00000000FFFFFFFF & a) | (((pmix_timer_t) b) << 32); + + return ret; +} + +#endif + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 1 + +#else + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 0 + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#endif /* ! 
PMIX_SYS_ARCH_TIMER_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sync_builtin/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sync_builtin/Makefile.include new file mode 100644 index 00000000000..a57977a81e3 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sync_builtin/Makefile.include @@ -0,0 +1,24 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2011 Sandia National Laboratories. All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This makefile.am does not stand on its own - it is included from pmix/include/Makefile.am + +headers += \ + atomics/sys/sync_builtin/atomic.h diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sync_builtin/atomic.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sync_builtin/atomic.h new file mode 100644 index 00000000000..51a9a1409b7 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/sync_builtin/atomic.h @@ -0,0 +1,137 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. 
+ * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_ATOMIC_H +#define PMIX_SYS_ARCH_ATOMIC_H 1 + +/********************************************************************** + * + * Memory Barriers + * + *********************************************************************/ +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 1 + +static inline void pmix_atomic_mb(void) +{ + __sync_synchronize(); +} + +static inline void pmix_atomic_rmb(void) +{ + __sync_synchronize(); +} + +static inline void pmix_atomic_wmb(void) +{ + __sync_synchronize(); +} + +#define MB() pmix_atomic_mb() + +/********************************************************************** + * + * Atomic math operations + * + *********************************************************************/ + +#define PMIX_HAVE_ATOMIC_CMPSET_32 1 +static inline int pmix_atomic_cmpset_acq_32( volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + return __sync_bool_compare_and_swap(addr, oldval, newval); +} + + +static inline int pmix_atomic_cmpset_rel_32( volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + return __sync_bool_compare_and_swap(addr, oldval, newval);} + +static inline int pmix_atomic_cmpset_32( volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + return __sync_bool_compare_and_swap(addr, oldval, newval); +} + +#define PMIX_HAVE_ATOMIC_MATH_32 1 + +#define PMIX_HAVE_ATOMIC_ADD_32 1 +static inline int32_t pmix_atomic_add_32(volatile int32_t *addr, int32_t delta) +{ + return __sync_add_and_fetch(addr, delta); +} + +#define PMIX_HAVE_ATOMIC_SUB_32 1 +static inline int32_t pmix_atomic_sub_32(volatile int32_t *addr, int32_t delta) +{ + 
return __sync_sub_and_fetch(addr, delta); +} + +#if PMIX_ASM_SYNC_HAVE_64BIT + +#define PMIX_HAVE_ATOMIC_CMPSET_64 1 +static inline int pmix_atomic_cmpset_acq_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + return __sync_bool_compare_and_swap(addr, oldval, newval); +} + +static inline int pmix_atomic_cmpset_rel_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + return __sync_bool_compare_and_swap(addr, oldval, newval);} + + +static inline int pmix_atomic_cmpset_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + return __sync_bool_compare_and_swap(addr, oldval, newval); +} + +#define PMIX_HAVE_ATOMIC_MATH_64 1 +#define PMIX_HAVE_ATOMIC_ADD_64 1 +static inline int64_t pmix_atomic_add_64(volatile int64_t *addr, int64_t delta) +{ + return __sync_add_and_fetch(addr, delta); +} + +#define PMIX_HAVE_ATOMIC_SUB_64 1 +static inline int64_t pmix_atomic_sub_64(volatile int64_t *addr, int64_t delta) +{ + return __sync_sub_and_fetch(addr, delta); +} + +#endif + +#if PMIX_HAVE_SYNC_BUILTIN_CSWAP_INT128 +static inline int pmix_atomic_cmpset_128 (volatile pmix_int128_t *addr, + pmix_int128_t oldval, pmix_int128_t newval) +{ + return __sync_bool_compare_and_swap(addr, oldval, newval); +} + +#define PMIX_HAVE_ATOMIC_CMPSET_128 1 + +#endif + +#endif /* ! PMIX_SYS_ARCH_ATOMIC_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/timer.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/timer.h new file mode 100644 index 00000000000..a364f61cc8f --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/timer.h @@ -0,0 +1,131 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2016 Broadcom Limited. All rights reserved. + * Copyright (c) 2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file + * + * Cycle counter reading instructions. Do not use directly - see the + * timer interface instead + */ + +#ifndef PMIX_SYS_TIMER_H +#define PMIX_SYS_TIMER_H 1 + +#include "pmix_config.h" + +#include "src/atomics/sys/architecture.h" + +#ifdef HAVE_SYS_TYPES_H +#include <sys/types.h> +#endif + +/* do some quick #define cleanup in cases where we are doing + testing... */ +#ifdef PMIX_DISABLE_INLINE_ASM +#undef PMIX_C_GCC_INLINE_ASSEMBLY +#define PMIX_C_GCC_INLINE_ASSEMBLY 0 +#undef PMIX_CXX_GCC_INLINE_ASSEMBLY +#define PMIX_CXX_GCC_INLINE_ASSEMBLY 0 +#undef PMIX_C_DEC_INLINE_ASSEMBLY +#define PMIX_C_DEC_INLINE_ASSEMBLY 0 +#undef PMIX_CXX_DEC_INLINE_ASSEMBLY +#define PMIX_CXX_DEC_INLINE_ASSEMBLY 0 +#undef PMIX_C_XLC_INLINE_ASSEMBLY +#define PMIX_C_XLC_INLINE_ASSEMBLY 0 +#undef PMIX_CXX_XLC_INLINE_ASSEMBLY +#define PMIX_CXX_XLC_INLINE_ASSEMBLY 0 +#endif + +/* define PMIX_{GCC,DEC,XLC}_INLINE_ASSEMBLY based on the + PMIX_{C,CXX}_{GCC,DEC,XLC}_INLINE_ASSEMBLY defines and whether we + are in C or C++ */ +#if defined(c_plusplus) || defined(__cplusplus) +#define PMIX_GCC_INLINE_ASSEMBLY PMIX_CXX_GCC_INLINE_ASSEMBLY +#define PMIX_DEC_INLINE_ASSEMBLY PMIX_CXX_DEC_INLINE_ASSEMBLY +#define PMIX_XLC_INLINE_ASSEMBLY PMIX_CXX_XLC_INLINE_ASSEMBLY +#else +#define PMIX_GCC_INLINE_ASSEMBLY PMIX_C_GCC_INLINE_ASSEMBLY +#define PMIX_DEC_INLINE_ASSEMBLY PMIX_C_DEC_INLINE_ASSEMBLY +#define PMIX_XLC_INLINE_ASSEMBLY PMIX_C_XLC_INLINE_ASSEMBLY +#endif +
+/********************************************************************** + * + * Load the appropriate architecture files and set some reasonable + * default values for our support + * + *********************************************************************/ + +/* By default we suppose all timers are monotonic per node. */ +#define PMIX_TIMER_MONOTONIC 1 + +BEGIN_C_DECLS + +/* If you update this list, you probably also want to update + src/mca/timer/linux/configure.m4. Or not. */ + +#if defined(DOXYGEN) +/* don't include system-level gorp when generating doxygen files */ +#elif PMIX_ASSEMBLY_ARCH == PMIX_X86_64 +#include "src/atomics/sys/x86_64/timer.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_ARM +#include "src/atomics/sys/arm/timer.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_ARM64 +#include "src/atomics/sys/arm64/timer.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_IA32 +#include "src/atomics/sys/ia32/timer.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_IA64 +#include "src/atomics/sys/ia64/timer.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_POWERPC32 +#include "src/atomics/sys/powerpc/timer.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_POWERPC64 +#include "src/atomics/sys/powerpc/timer.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_SPARCV9_32 +#include "src/atomics/sys/sparcv9/timer.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_SPARCV9_64 +#include "src/atomics/sys/sparcv9/timer.h" +#elif PMIX_ASSEMBLY_ARCH == PMIX_MIPS +#include "src/atomics/sys/mips/timer.h" +#endif + +#ifndef DOXYGEN +#ifndef PMIX_HAVE_SYS_TIMER_GET_CYCLES +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 0 + +typedef long pmix_timer_t; +#endif +#endif + +#ifndef PMIX_HAVE_SYS_TIMER_IS_MONOTONIC + +#define PMIX_HAVE_SYS_TIMER_IS_MONOTONIC 1 + +static inline bool pmix_sys_timer_is_monotonic (void) +{ + return PMIX_TIMER_MONOTONIC; +} + +#endif + +END_C_DECLS + +#endif /* PMIX_SYS_TIMER_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/x86_64/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/x86_64/Makefile.include new file mode 100644 index 
00000000000..79a42b8e833 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/x86_64/Makefile.include @@ -0,0 +1,26 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2017 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This makefile.am does not stand on its own - it is included from pmix/include/Makefile.am + +headers += \ + atomics/sys/x86_64/atomic.h \ + atomics/sys/x86_64/timer.h diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/x86_64/atomic.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/x86_64/atomic.h new file mode 100644 index 00000000000..aa71aae3646 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/x86_64/atomic.h @@ -0,0 +1,281 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2010 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserverd. 
+ * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2016-2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef PMIX_SYS_ARCH_ATOMIC_H +#define PMIX_SYS_ARCH_ATOMIC_H 1 + +/* + * On x86_64, we use cmpxchg. + */ + + +#define PMIXSMPLOCK "lock; " +#define PMIXMB() __asm__ __volatile__("": : :"memory") + + +/********************************************************************** + * + * Define constants for AMD64 / x86_64 / EM64T / ... + * + *********************************************************************/ +#define PMIX_HAVE_ATOMIC_MEM_BARRIER 1 + +#define PMIX_HAVE_ATOMIC_CMPSET_32 1 + +#define PMIX_HAVE_ATOMIC_CMPSET_64 1 + +/********************************************************************** + * + * Memory Barriers + * + *********************************************************************/ +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline void pmix_atomic_mb(void) +{ + PMIXMB(); +} + + +static inline void pmix_atomic_rmb(void) +{ + PMIXMB(); +} + + +static inline void pmix_atomic_wmb(void) +{ + PMIXMB(); +} + +static inline void pmix_atomic_isync(void) +{ +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + + +/********************************************************************** + * + * Atomic math operations + * + *********************************************************************/ +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline int pmix_atomic_cmpset_32( volatile int32_t *addr, + int32_t oldval, int32_t newval) +{ + unsigned char ret; + __asm__ __volatile__ ( + PMIXSMPLOCK "cmpxchgl %3,%2 \n\t" + "sete %0 \n\t" + : "=qm" (ret), "+a" (oldval), "+m" (*addr) + : "q"(newval) + : "memory", "cc"); + + return (int)ret; +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#define pmix_atomic_cmpset_acq_32 pmix_atomic_cmpset_32 +#define 
pmix_atomic_cmpset_rel_32 pmix_atomic_cmpset_32 + +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline int pmix_atomic_cmpset_64( volatile int64_t *addr, + int64_t oldval, int64_t newval) +{ + unsigned char ret; + __asm__ __volatile__ ( + PMIXSMPLOCK "cmpxchgq %3,%2 \n\t" + "sete %0 \n\t" + : "=qm" (ret), "+a" (oldval), "+m" (*((volatile long*)addr)) + : "q"(newval) + : "memory", "cc" + ); + + return (int)ret; +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#define pmix_atomic_cmpset_acq_64 pmix_atomic_cmpset_64 +#define pmix_atomic_cmpset_rel_64 pmix_atomic_cmpset_64 + +#if PMIX_GCC_INLINE_ASSEMBLY && PMIX_HAVE_CMPXCHG16B && HAVE_PMIX_INT128_T + +static inline int pmix_atomic_cmpset_128 (volatile pmix_int128_t *addr, pmix_int128_t oldval, + pmix_int128_t newval) +{ + unsigned char ret; + + /* cmpxchg16b compares the value at the address with rax:rdx (low:high). if the values are + * the same the contents of rbx:rcx are stored at the address. in all cases the value stored + * at the address is returned in rax:rdx.
*/ + __asm__ __volatile__ (PMIXSMPLOCK "cmpxchg16b (%%rsi) \n\t" + "sete %0 \n\t" + : "=qm" (ret) + : "S" (addr), "b" (((int64_t *)&newval)[0]), "c" (((int64_t *)&newval)[1]), + "a" (((int64_t *)&oldval)[0]), "d" (((int64_t *)&oldval)[1]) + : "memory", "cc"); + + return (int) ret; +} + +#define PMIX_HAVE_ATOMIC_CMPSET_128 1 + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + + +#if PMIX_GCC_INLINE_ASSEMBLY + +#define PMIX_HAVE_ATOMIC_SWAP_32 1 + +#define PMIX_HAVE_ATOMIC_SWAP_64 1 + +static inline int32_t pmix_atomic_swap_32( volatile int32_t *addr, + int32_t newval) +{ + int32_t oldval; + + __asm__ __volatile__("xchg %1, %0" : + "=r" (oldval), "+m" (*addr) : + "0" (newval) : + "memory"); + return oldval; +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#if PMIX_GCC_INLINE_ASSEMBLY + +static inline int64_t pmix_atomic_swap_64( volatile int64_t *addr, + int64_t newval) +{ + int64_t oldval; + + __asm__ __volatile__("xchgq %1, %0" : + "=r" (oldval), "+m" (*addr) : + "0" (newval) : + "memory"); + return oldval; +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + + + +#if PMIX_GCC_INLINE_ASSEMBLY + +#define PMIX_HAVE_ATOMIC_MATH_32 1 +#define PMIX_HAVE_ATOMIC_MATH_64 1 + +#define PMIX_HAVE_ATOMIC_ADD_32 1 + +/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type int + * + * Atomically adds @i to @v. + */ +static inline int32_t pmix_atomic_add_32(volatile int32_t* v, int i) +{ + int ret = i; + __asm__ __volatile__( + PMIXSMPLOCK "xaddl %1,%0" + :"+m" (*v), "+r" (ret) + : + :"memory", "cc" + ); + return (ret+i); +} + +#define PMIX_HAVE_ATOMIC_ADD_64 1 + +/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type int + * + * Atomically adds @i to @v. 
+ */ +static inline int64_t pmix_atomic_add_64(volatile int64_t* v, int64_t i) +{ + int64_t ret = i; + __asm__ __volatile__( + PMIXSMPLOCK "xaddq %1,%0" + :"+m" (*v), "+r" (ret) + : + :"memory", "cc" + ); + return (ret+i); +} + +#define PMIX_HAVE_ATOMIC_SUB_32 1 + +/** + * atomic_sub - subtract the atomic variable + * @i: integer value to subtract + * @v: pointer of type int + * + * Atomically subtracts @i from @v. + */ +static inline int32_t pmix_atomic_sub_32(volatile int32_t* v, int i) +{ + int ret = -i; + __asm__ __volatile__( + PMIXSMPLOCK "xaddl %1,%0" + :"+m" (*v), "+r" (ret) + : + :"memory", "cc" + ); + return (ret-i); +} + +#define PMIX_HAVE_ATOMIC_SUB_64 1 + +/** + * atomic_sub - subtract the atomic variable + * @i: integer value to subtract + * @v: pointer of type int + * + * Atomically subtracts @i from @v. + */ +static inline int64_t pmix_atomic_sub_64(volatile int64_t* v, int64_t i) +{ + int64_t ret = -i; + __asm__ __volatile__( + PMIXSMPLOCK "xaddq %1,%0" + :"+m" (*v), "+r" (ret) + : + :"memory", "cc" + ); + return (ret-i); +} + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#endif /* ! PMIX_SYS_ARCH_ATOMIC_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/x86_64/timer.h b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/x86_64/timer.h new file mode 100644 index 00000000000..0d6019c36fc --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/atomics/sys/x86_64/timer.h @@ -0,0 +1,75 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * Copyright (c) 2016 Los Alamos National Security, LLC. ALl rights + * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_SYS_ARCH_TIMER_H +#define PMIX_SYS_ARCH_TIMER_H 1 + + +typedef uint64_t pmix_timer_t; + +/* Using RDTSC(P) results in non-monotonic timers across cores */ +#undef PMIX_TIMER_MONOTONIC +#define PMIX_TIMER_MONOTONIC 0 + +#if PMIX_GCC_INLINE_ASSEMBLY + +/* TODO: add AMD mfence version and dispatch at init */ +static inline pmix_timer_t +pmix_sys_timer_get_cycles(void) +{ + uint32_t l, h; + __asm__ __volatile__ ("lfence\n\t" + "rdtsc\n\t" + : "=a" (l), "=d" (h)); + return ((pmix_timer_t)l) | (((pmix_timer_t)h) << 32); +} + +static inline bool pmix_sys_timer_is_monotonic (void) +{ + int64_t tmp; + int32_t cpuid1, cpuid2; + const int32_t level = 0x80000007; + + /* cpuid clobbers ebx but it must be restored for -fPIC so save + * then restore ebx */ + __asm__ volatile ("xchg %%rbx, %2\n" + "cpuid\n" + "xchg %%rbx, %2\n": + "=a" (cpuid1), "=d" (cpuid2), "=r" (tmp) : + "a" (level) : + "ecx", "ebx"); + /* bit 8 of edx contains the invariant tsc flag */ + return !!(cpuid2 & (1 << 8)); +} + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 1 +#define PMIX_HAVE_SYS_TIMER_IS_MONOTONIC 1 + +#else + +pmix_timer_t pmix_sys_timer_get_cycles(void); + +#define PMIX_HAVE_SYS_TIMER_GET_CYCLES 1 + +#endif /* PMIX_GCC_INLINE_ASSEMBLY */ + +#endif /* ! 
PMIX_SYS_ARCH_TIMER_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/open_close.c b/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/open_close.c index ddb48071db4..47450245547 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/open_close.c +++ b/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/open_close.c @@ -75,10 +75,10 @@ static void pmix_buffer_destruct (pmix_buffer_t* buffer) } } -PMIX_CLASS_INSTANCE(pmix_buffer_t, - pmix_object_t, - pmix_buffer_construct, - pmix_buffer_destruct); +PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_buffer_t, + pmix_object_t, + pmix_buffer_construct, + pmix_buffer_destruct); static void pmix_bfrop_type_info_construct(pmix_bfrop_type_info_t *obj) @@ -97,9 +97,9 @@ static void pmix_bfrop_type_info_destruct(pmix_bfrop_type_info_t *obj) } } -PMIX_CLASS_INSTANCE(pmix_bfrop_type_info_t, pmix_object_t, - pmix_bfrop_type_info_construct, - pmix_bfrop_type_info_destruct); +PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_bfrop_type_info_t, pmix_object_t, + pmix_bfrop_type_info_construct, + pmix_bfrop_type_info_destruct); static void kvcon(pmix_kval_t *k) { @@ -115,18 +115,18 @@ static void kvdes(pmix_kval_t *k) PMIX_VALUE_RELEASE(k->value); } } -PMIX_CLASS_INSTANCE(pmix_kval_t, - pmix_list_item_t, - kvcon, kvdes); +PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_kval_t, + pmix_list_item_t, + kvcon, kvdes); static void rcon(pmix_regex_range_t *p) { p->start = 0; p->cnt = 0; } -PMIX_CLASS_INSTANCE(pmix_regex_range_t, - pmix_list_item_t, - rcon, NULL); +PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_regex_range_t, + pmix_list_item_t, + rcon, NULL); static void rvcon(pmix_regex_value_t *p) { @@ -145,9 +145,9 @@ static void rvdes(pmix_regex_value_t *p) } PMIX_LIST_DESTRUCT(&p->ranges); } -PMIX_CLASS_INSTANCE(pmix_regex_value_t, - pmix_list_item_t, - rvcon, rvdes); +PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_regex_value_t, + pmix_list_item_t, + rvcon, rvdes); PMIX_EXPORT pmix_status_t pmix_bfrop_open(void) { diff --git a/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/pack.c 
b/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/pack.c index 0a562a3a25d..000be85c5bf 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/pack.c +++ b/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/pack.c @@ -560,6 +560,11 @@ static pmix_status_t pack_val(pmix_buffer_t *buffer, return ret; } break; + case PMIX_POINTER: + if (PMIX_SUCCESS != (ret = pmix_bfrop_pack_buffer(buffer, &p->data.ptr, 1, PMIX_POINTER))) { + return ret; + } + break; case PMIX_SCOPE: if (PMIX_SUCCESS != (ret = pmix_bfrop_pack_buffer(buffer, &p->data.scope, 1, PMIX_SCOPE))) { return ret; diff --git a/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/unpack.c b/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/unpack.c index 9eb10fecf0c..53e73ac1c9b 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/unpack.c +++ b/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/unpack.c @@ -632,7 +632,7 @@ pmix_status_t pmix_bfrop_unpack_status(pmix_buffer_t *buffer, void *dest, break; case PMIX_PROC: /* this field is now a pointer, so we must allocate storage for it */ - PMIX_PROC_CREATE(val->data.proc, 1); + PMIX_PROC_CREATE(val->data.proc, m); if (NULL == val->data.proc) { return PMIX_ERR_NOMEM; } @@ -656,6 +656,11 @@ pmix_status_t pmix_bfrop_unpack_status(pmix_buffer_t *buffer, void *dest, return ret; } break; + case PMIX_POINTER: + if (PMIX_SUCCESS != (ret = pmix_bfrop_unpack_buffer(buffer, &val->data.ptr, &m, PMIX_POINTER))) { + return ret; + } + break; case PMIX_SCOPE: if (PMIX_SUCCESS != (ret = pmix_bfrop_unpack_buffer(buffer, &val->data.scope, &m, PMIX_SCOPE))) { return ret; diff --git a/opal/mca/pmix/pmix2x/pmix/src/class/pmix_pointer_array.c b/opal/mca/pmix/pmix2x/pmix/src/class/pmix_pointer_array.c index a3b3f534a43..dfd3b9a2c16 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/class/pmix_pointer_array.c +++ b/opal/mca/pmix/pmix2x/pmix/src/class/pmix_pointer_array.c @@ -3,16 +3,14 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
- * Copyright (c) 2004-2007 The University of Tennessee and The University + * Copyright (c) 2004-2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -20,29 +18,23 @@ * $HEADER$ */ -#include +#include "pmix_config.h" +#include "pmix_common.h" #include #include #include -#if HAVE_STDBOOL_H -#include -#endif -#include #include "src/class/pmix_pointer_array.h" #include "src/util/output.h" -#include "include/pmix_common.h" - -enum { TABLE_INIT = 1, TABLE_GROW = 2 }; static void pmix_pointer_array_construct(pmix_pointer_array_t *); static void pmix_pointer_array_destruct(pmix_pointer_array_t *); -static bool grow_table(pmix_pointer_array_t *table, int soft, int hard); +static bool grow_table(pmix_pointer_array_t *table, int at_least); PMIX_CLASS_INSTANCE(pmix_pointer_array_t, pmix_object_t, - pmix_pointer_array_construct, - pmix_pointer_array_destruct); + pmix_pointer_array_construct, + pmix_pointer_array_destruct); /* * pmix_pointer_array constructor @@ -53,8 +45,9 @@ static void pmix_pointer_array_construct(pmix_pointer_array_t *array) array->number_free = 0; array->size = 0; array->max_size = INT_MAX; - array->block_size = 0; - array->addr = 0; + array->block_size = 8; + array->free_bits = NULL; + array->addr = NULL; } /* @@ -63,20 +56,122 @@ static void pmix_pointer_array_construct(pmix_pointer_array_t *array) static void pmix_pointer_array_destruct(pmix_pointer_array_t *array) { /* free table */ - if( NULL != 
array->addr) { + if( NULL != array->free_bits) { + free(array->free_bits); + array->free_bits = NULL; + } + if( NULL != array->addr ) { free(array->addr); array->addr = NULL; } array->size = 0; + } +#define TYPE_ELEM_COUNT(TYPE, CAP) (((CAP) + 8 * sizeof(TYPE) - 1) / (8 * sizeof(TYPE))) + +/** + * Translate an index position into the free bits array into 2 values, the + * index of the element and the index of the bit position. + */ +#define GET_BIT_POS(IDX, BIDX, PIDX) \ + do { \ + uint32_t __idx = (uint32_t)(IDX); \ + (BIDX) = (__idx / (8 * sizeof(uint64_t))); \ + (PIDX) = (__idx % (8 * sizeof(uint64_t))); \ + } while(0) + +/** + * A classical find first zero bit (ffs) on a large array. It checks starting + * from the indicated position until it finds a zero bit. If SET is true, + * the bit is set. The position of the bit is returned in store. + */ +#define FIND_FIRST_ZERO(START_IDX, STORE, SET) \ + do { \ + uint32_t __b_idx, __b_pos; \ + GET_BIT_POS((START_IDX), __b_idx, __b_pos); \ + for (; table->free_bits[__b_idx] == 0xFFFFFFFFFFFFFFFFULL; __b_idx++); \ + assert(__b_idx < (uint32_t)table->size); \ + uint64_t __check_value = table->free_bits[__b_idx]; \ + __b_pos = 0; \ + \ + if( 0x00000000FFFFFFFFULL == (__check_value & 0x00000000FFFFFFFFULL) ) { \ + __check_value >>= 32; __b_pos += 32; \ + } \ + if( 0x000000000000FFFFULL == (__check_value & 0x000000000000FFFFULL) ) { \ + __check_value >>= 16; __b_pos += 16; \ + } \ + if( 0x00000000000000FFULL == (__check_value & 0x00000000000000FFULL) ) { \ + __check_value >>= 8; __b_pos += 8; \ + } \ + if( 0x000000000000000FULL == (__check_value & 0x000000000000000FULL) ) { \ + __check_value >>= 4; __b_pos += 4; \ + } \ + if( 0x0000000000000003ULL == (__check_value & 0x0000000000000003ULL) ) { \ + __check_value >>= 2; __b_pos += 2; \ + } \ + if( 0x0000000000000001ULL == (__check_value & 0x0000000000000001ULL) ) { \ + __b_pos += 1; \ + } \ + if( (SET) ) { \ + table->free_bits[__b_idx] |= (1ULL << __b_pos); \ + } \ + (STORE) = 
(__b_idx * 8 * sizeof(uint64_t)) + __b_pos; \ + } while(0) + +/** + * Set the IDX bit in the free_bits array. The bit should be previously unset. + */ +#define SET_BIT(IDX) \ + do { \ + uint32_t __b_idx, __b_pos; \ + GET_BIT_POS((IDX), __b_idx, __b_pos); \ + assert( 0 == (table->free_bits[__b_idx] & (1ULL << __b_pos))); \ + table->free_bits[__b_idx] |= (1ULL << __b_pos); \ + } while(0) + +/** + * Unset the IDX bit in the free_bits array. The bit should be previously set. + */ +#define UNSET_BIT(IDX) \ + do { \ + uint32_t __b_idx, __b_pos; \ + GET_BIT_POS((IDX), __b_idx, __b_pos); \ + assert( (table->free_bits[__b_idx] & (1ULL << __b_pos))); \ + table->free_bits[__b_idx] ^= (1ULL << __b_pos); \ + } while(0) + +#if 0 +/** + * Validate the pointer array by making sure that the elements and + * the free bits array are in sync. It also checks that the number + * of remaining free elements is consistent. + */ +static void pmix_pointer_array_validate(pmix_pointer_array_t *array) +{ + int i, cnt = 0; + uint32_t b_idx, p_idx; + + for( i = 0; i < array->size; i++ ) { + GET_BIT_POS(i, b_idx, p_idx); + if( NULL == array->addr[i] ) { + cnt++; + assert( 0 == (array->free_bits[b_idx] & (1ULL << p_idx)) ); + } else { + assert( 0 != (array->free_bits[b_idx] & (1ULL << p_idx)) ); + } + } + assert(cnt == array->number_free); +} +#endif + /** * initialize an array object */ -pmix_status_t pmix_pointer_array_init(pmix_pointer_array_t* array, - int initial_allocation, - int max_size, int block_size) +int pmix_pointer_array_init(pmix_pointer_array_t* array, + int initial_allocation, + int max_size, int block_size) { size_t num_bytes; @@ -86,18 +181,24 @@ pmix_status_t pmix_pointer_array_init(pmix_pointer_array_t* array, } array->max_size = max_size; - array->block_size = block_size; + array->block_size = (0 == block_size ? 8 : block_size); + array->lowest_free = 0; num_bytes = (0 < initial_allocation ?
initial_allocation : block_size); - array->number_free = num_bytes; - array->size = num_bytes; - num_bytes *= sizeof(void*); /* Allocate and set the array to NULL */ - array->addr = (void **)calloc(num_bytes, 1); + array->addr = (void **)calloc(num_bytes, sizeof(void*)); if (NULL == array->addr) { /* out of memory */ return PMIX_ERR_OUT_OF_RESOURCE; } + array->free_bits = (uint64_t*)calloc(TYPE_ELEM_COUNT(uint64_t, num_bytes), sizeof(uint64_t)); + if (NULL == array->free_bits) { /* out of memory */ + free(array->addr); + array->addr = NULL; + return PMIX_ERR_OUT_OF_RESOURCE; + } + array->number_free = num_bytes; + array->size = num_bytes; return PMIX_SUCCESS; } @@ -112,13 +213,11 @@ pmix_status_t pmix_pointer_array_init(pmix_pointer_array_t* array, */ int pmix_pointer_array_add(pmix_pointer_array_t *table, void *ptr) { - int i, index; + int index = table->size + 1; if (table->number_free == 0) { /* need to grow table */ - if (!grow_table(table, - (NULL == table->addr ? TABLE_INIT : table->size * TABLE_GROW), - INT_MAX)) { + if (!grow_table(table, index) ) { return PMIX_ERR_OUT_OF_RESOURCE; } } @@ -132,21 +231,19 @@ int pmix_pointer_array_add(pmix_pointer_array_t *table, void *ptr) */ index = table->lowest_free; - assert(table->addr[index] == NULL); + assert(NULL == table->addr[index]); table->addr[index] = ptr; table->number_free--; + SET_BIT(index); if (table->number_free > 0) { - for (i = table->lowest_free + 1; i < table->size; i++) { - if (table->addr[i] == NULL) { - table->lowest_free = i; - break; - } - } - } - else { + FIND_FIRST_ZERO(index, table->lowest_free, 0); + } else { table->lowest_free = table->size; } +#if 0 + pmix_pointer_array_validate(table); +#endif return index; } @@ -161,48 +258,48 @@ int pmix_pointer_array_add(pmix_pointer_array_t *table, void *ptr) * * Assumption: NULL element is free element. 
*/ -pmix_status_t pmix_pointer_array_set_item(pmix_pointer_array_t *table, int index, - void * value) +int pmix_pointer_array_set_item(pmix_pointer_array_t *table, int index, + void * value) { assert(table != NULL); + if (PMIX_UNLIKELY(0 > index)) { + return PMIX_ERROR; + } + /* expand table if required to set a specific index */ if (table->size <= index) { - if (!grow_table(table, ((index / TABLE_GROW) + 1) * TABLE_GROW, - index)) { + if (!grow_table(table, index)) { return PMIX_ERROR; } } - + assert(table->size > index); /* mark element as free, if NULL element */ if( NULL == value ) { - if (index < table->lowest_free) { - table->lowest_free = index; - } if( NULL != table->addr[index] ) { + if (index < table->lowest_free) { + table->lowest_free = index; + } table->number_free++; + UNSET_BIT(index); } } else { if (NULL == table->addr[index]) { table->number_free--; - } - /* Reset lowest_free if required */ - if ( index == table->lowest_free ) { - int i; - - table->lowest_free = table->size; - for ( i=index + 1; isize; i++) { - if ( NULL == table->addr[i] ){ - table->lowest_free = i; - break; - } + SET_BIT(index); + /* Reset lowest_free if required */ + if ( index == table->lowest_free ) { + FIND_FIRST_ZERO(index, table->lowest_free, 0); } + } else { + assert( index != table->lowest_free ); } } table->addr[index] = value; #if 0 + pmix_pointer_array_validate(table); pmix_output(0,"pmix_pointer_array_set_item: OUT: " " table %p (size %ld, lowest free %ld, number free %ld)" " addr[%d] = %p\n", @@ -250,8 +347,7 @@ bool pmix_pointer_array_test_and_set_item (pmix_pointer_array_t *table, /* Do we need to grow the table? */ if (table->size <= index) { - if (!grow_table(table, (((index / TABLE_GROW) + 1) * TABLE_GROW), - index)) { + if (!grow_table(table, index)) { return false; } } @@ -259,22 +355,21 @@ bool pmix_pointer_array_test_and_set_item (pmix_pointer_array_t *table, /* * allow a specific index to be changed. 
*/ + assert(NULL == table->addr[index]); table->addr[index] = value; table->number_free--; + SET_BIT(index); /* Reset lowest_free if required */ - if ( index == table->lowest_free ) { - int i; - - table->lowest_free = table->size; - for ( i=index; isize; i++) { - if ( NULL == table->addr[i] ){ - table->lowest_free = i; - break; - } + if( table->number_free > 0 ) { + if ( index == table->lowest_free ) { + FIND_FIRST_ZERO(index, table->lowest_free, 0); } + } else { + table->lowest_free = table->size; } #if 0 + pmix_pointer_array_validate(table); pmix_output(0,"pmix_pointer_array_test_and_set_item: OUT: " " table %p (size %ld, lowest free %ld, number free %ld)" " addr[%d] = %p\n", @@ -285,47 +380,55 @@ bool pmix_pointer_array_test_and_set_item (pmix_pointer_array_t *table, return true; } -pmix_status_t pmix_pointer_array_set_size(pmix_pointer_array_t *array, int new_size) +int pmix_pointer_array_set_size(pmix_pointer_array_t *array, int new_size) { if(new_size > array->size) { - if (!grow_table(array, new_size, new_size)) { + if (!grow_table(array, new_size)) { return PMIX_ERROR; } } return PMIX_SUCCESS; } -static bool grow_table(pmix_pointer_array_t *table, int soft, int hard) +static bool grow_table(pmix_pointer_array_t *table, int at_least) { - int new_size; - int i, new_size_int; + int i, new_size, new_size_int; void *p; - /* new_size = ((table->size + num_needed + table->block_size - 1) / - table->block_size) * table->block_size; */ - new_size = soft; - if( soft > table->max_size ) { - if( hard > table->max_size ) { + new_size = table->block_size * ((at_least + 1 + table->block_size - 1) / table->block_size); + if( new_size >= table->max_size ) { + new_size = table->max_size; + if( at_least >= table->max_size ) { return false; } - new_size = hard; - } - if( new_size >= table->max_size ) { - return false; } p = (void **) realloc(table->addr, new_size * sizeof(void *)); - if (p == NULL) { + if (NULL == p) { return false; } - new_size_int = (int) new_size; - 
table->number_free += new_size_int - table->size; + table->number_free += (new_size - table->size); table->addr = (void**)p; - for (i = table->size; i < new_size_int; ++i) { + for (i = table->size; i < new_size; ++i) { table->addr[i] = NULL; } - table->size = new_size_int; - + new_size_int = TYPE_ELEM_COUNT(uint64_t, new_size); + if( (int)(TYPE_ELEM_COUNT(uint64_t, table->size)) != new_size_int ) { + p = (uint64_t*)realloc(table->free_bits, new_size_int * sizeof(uint64_t)); + if (NULL == p) { + return false; + } + table->free_bits = (uint64_t*)p; + for (i = TYPE_ELEM_COUNT(uint64_t, table->size); + i < new_size_int; i++ ) { + table->free_bits[i] = 0; + } + } + table->size = new_size; +#if 0 + pmix_output(0, "grow_table %p to %d (max_size %d, block %d, number_free %d)\n", + (void*)table, table->size, table->max_size, table->block_size, table->number_free); +#endif return true; } diff --git a/opal/mca/pmix/pmix2x/pmix/src/class/pmix_pointer_array.h b/opal/mca/pmix/pmix2x/pmix/src/class/pmix_pointer_array.h index b3f647f89de..b369a5a0ce2 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/class/pmix_pointer_array.h +++ b/opal/mca/pmix/pmix2x/pmix/src/class/pmix_pointer_array.h @@ -3,34 +3,37 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2008 The University of Tennessee and The University + * Copyright (c) 2004-2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. 
All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ +/** @file + * + * Utility functions to manage fortran <-> c opaque object + * translation. Note that since MPI defines fortran handles as + * [signed] int's, we use int everywhere in here where you would + * normally expect size_t. There's some code that makes sure indices + * don't go above FORTRAN_HANDLE_MAX (which is min(INT_MAX, fortran + * INTEGER max)), just to be sure. + */ #ifndef PMIX_POINTER_ARRAY_H #define PMIX_POINTER_ARRAY_H -#include - -#if HAVE_STDBOOL_H -#include -#endif +#include "pmix_config.h" #include "src/class/pmix_object.h" -#include +#include "src/include/prefetch.h" BEGIN_C_DECLS @@ -53,6 +56,8 @@ struct pmix_pointer_array_t { int max_size; /** block size for each allocation */ int block_size; + /** pointer to an array of bits to speed up the research for an empty position. */ + uint64_t* free_bits; /** pointer to array of pointers */ void **addr; }; @@ -63,7 +68,7 @@ typedef struct pmix_pointer_array_t pmix_pointer_array_t; /** * Class declaration */ -PMIX_CLASS_DECLARATION(pmix_pointer_array_t); +PMIX_EXPORT PMIX_CLASS_DECLARATION(pmix_pointer_array_t); /** * Initialize the pointer array with an initial size of initial_allocation. @@ -79,9 +84,9 @@ PMIX_CLASS_DECLARATION(pmix_pointer_array_t); * @return PMIX_SUCCESS if all initializations were succesfull. Otherwise, * the error indicate what went wrong in the function. 
*/ -PMIX_EXPORT pmix_status_t pmix_pointer_array_init(pmix_pointer_array_t* array, - int initial_allocation, - int max_size, int block_size ); +PMIX_EXPORT int pmix_pointer_array_init(pmix_pointer_array_t* array, + int initial_allocation, + int max_size, int block_size); /** * Add a pointer to the array (Grow the array, if need be) @@ -101,11 +106,10 @@ PMIX_EXPORT int pmix_pointer_array_add(pmix_pointer_array_t *array, void *ptr); * @param index Index of element to be reset (IN) * @param value New value to be set at element index (IN) * - * @return PMIX_SUCCESS if item was inserted. Otherwise, - * the error indicate what went wrong in the function. + * @return Error code. (-1) indicates an error. */ -PMIX_EXPORT pmix_status_t pmix_pointer_array_set_item(pmix_pointer_array_t *array, - int index, void *value); +PMIX_EXPORT int pmix_pointer_array_set_item(pmix_pointer_array_t *array, + int index, void *value); /** * Get the value of an element in array @@ -121,7 +125,7 @@ static inline void *pmix_pointer_array_get_item(pmix_pointer_array_t *table, { void *p; - if( table->size <= element_index ) { + if( PMIX_UNLIKELY(0 > element_index || table->size <= element_index) ) { return NULL; } p = table->addr[element_index]; @@ -151,13 +155,10 @@ static inline int pmix_pointer_array_get_size(pmix_pointer_array_t *array) * * @param size Desired size of the array * - * @return PMIX_SUCCESS new size was set. Otherwise, - * the error indicate what went wrong in the function. - * * Simple function to set the size of the array in order to * hide the member field from external users. */ -PMIX_EXPORT pmix_status_t pmix_pointer_array_set_size(pmix_pointer_array_t *array, int size); +PMIX_EXPORT int pmix_pointer_array_set_size(pmix_pointer_array_t *array, int size); /** * Test whether a certain element is already in use. If not yet @@ -174,8 +175,8 @@ PMIX_EXPORT pmix_status_t pmix_pointer_array_set_size(pmix_pointer_array_t *arra * a value, unless the previous value is NULL ( equiv. 
to free ). */ PMIX_EXPORT bool pmix_pointer_array_test_and_set_item (pmix_pointer_array_t *table, - int index, - void *value); + int index, + void *value); /** * Empty the array. @@ -191,9 +192,12 @@ static inline void pmix_pointer_array_remove_all(pmix_pointer_array_t *array) array->lowest_free = 0; array->number_free = array->size; - for(i=0; isize; i++) { + for(i = 0; i < array->size; i++) { array->addr[i] = NULL; } + for(i = 0; i < (int)((array->size + 8*sizeof(uint64_t) - 1) / (8*sizeof(uint64_t))); i++) { + array->free_bits[i] = 0; + } } END_C_DECLS diff --git a/opal/mca/pmix/pmix2x/pmix/src/client/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/client/Makefile.include index e9abb45ff19..0bf6efed743 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/client/Makefile.include +++ b/opal/mca/pmix/pmix2x/pmix/src/client/Makefile.include @@ -22,7 +22,7 @@ sources += \ client/pmix_client_spawn.c \ client/pmix_client_connect.c -if WANT_PMIX_BACKWARD +if WANT_PMI_BACKWARD sources += \ client/pmi1.c \ client/pmi2.c diff --git a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c index a1b9546bedb..7c5953baee8 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c +++ b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c @@ -57,7 +57,7 @@ #elif PMIX_CC_USE_IDENT #ident PMIX_VERSION #endif - static const char pmix_version_string[] = PMIX_VERSION; +static const char pmix_version_string[] = PMIX_VERSION; #include "src/class/pmix_list.h" @@ -70,6 +70,7 @@ #include "src/util/output.h" #include "src/runtime/pmix_progress_threads.h" #include "src/runtime/pmix_rte.h" +#include "src/threads/threads.h" #include "src/mca/ptl/ptl.h" #include "src/include/pmix_globals.h" #if defined(PMIX_ENABLE_DSTORE) && (PMIX_ENABLE_DSTORE == 1) @@ -166,6 +167,7 @@ static void pmix_client_notify_recv(struct pmix_peer_t *peer, pmix_client_globals_t pmix_client_globals = {{{0}}}; +pmix_mutex_t pmix_client_bootstrap_mutex = PMIX_MUTEX_STATIC_INIT; 
/* callback for wait completion */ static void wait_cbfunc(struct pmix_peer_t *pr, @@ -236,6 +238,79 @@ static void evhandler_reg_callbk(pmix_status_t status, *active = status; } +typedef struct { + pmix_info_t *info; + size_t ninfo; +} mydata_t; + +static void release_info(pmix_status_t status, void *cbdata) +{ + mydata_t *cd = (mydata_t*)cbdata; + PMIX_INFO_FREE(cd->info, cd->ninfo); + free(cd); +} + +static void _check_for_notify(pmix_info_t info[], size_t ninfo) +{ + mydata_t *cd; + size_t n, m=0; + pmix_info_t *model=NULL, *library=NULL, *vers=NULL, *tmod=NULL; + + for (n=0; n < ninfo; n++) { + if (0 == strncmp(info[n].key, PMIX_PROGRAMMING_MODEL, PMIX_MAX_KEYLEN)) { + /* we need to generate an event indicating that + * a programming model has been declared */ + model = &info[n]; + ++m; + } else if (0 == strncmp(info[n].key, PMIX_MODEL_LIBRARY_NAME, PMIX_MAX_KEYLEN)) { + library = &info[n]; + ++m; + } else if (0 == strncmp(info[n].key, PMIX_MODEL_LIBRARY_VERSION, PMIX_MAX_KEYLEN)) { + vers = &info[n]; + ++m; + } else if (0 == strncmp(info[n].key, PMIX_THREADING_MODEL, PMIX_MAX_KEYLEN)) { + tmod = &info[n]; + ++m; + } + } + if (0 < m) { + /* notify anyone listening that a model has been declared */ + cd = (mydata_t*)malloc(sizeof(mydata_t)); + if (NULL == cd) { + /* nothing we can do */ + return; + } + PMIX_INFO_CREATE(cd->info, m+1); + if (NULL == cd->info) { + free(cd); + return; + } + cd->ninfo = m+1; + n = 0; + if (NULL != model) { + PMIX_INFO_XFER(&cd->info[n], model); + ++n; + } + if (NULL != library) { + PMIX_INFO_XFER(&cd->info[n], library); + ++n; + } + if (NULL != vers) { + PMIX_INFO_XFER(&cd->info[n], vers); + ++n; + } + if (NULL != tmod) { + PMIX_INFO_XFER(&cd->info[n], tmod); + ++n; + } + /* mark that it is not to go to any default handlers */ + PMIX_INFO_LOAD(&cd->info[n], PMIX_EVENT_NON_DEFAULT, NULL, PMIX_BOOL); + PMIx_Notify_event(PMIX_MODEL_DECLARED, + &pmix_globals.myid, PMIX_RANGE_PROC_LOCAL, + cd->info, cd->ninfo, release_info, (void*)cd); 
+ } +} + PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, pmix_info_t info[], size_t ninfo) { @@ -255,6 +330,8 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, return PMIX_ERR_BAD_PARAM; } + pmix_mutex_lock(&pmix_client_bootstrap_mutex); + if (0 < pmix_globals.init_cntr || PMIX_PROC_SERVER == pmix_globals.proc_type) { /* since we have been called before, the nspace and * rank should be known. So return them here if @@ -263,11 +340,19 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, (void)strncpy(proc->nspace, pmix_globals.myid.nspace, PMIX_MAX_NSLEN); proc->rank = pmix_globals.myid.rank; } + /* we also need to check the info keys to see if something need + * be done with them - e.g., to notify another library that we + * also have called init */ + if (NULL != info) { + _check_for_notify(info, ninfo); + } ++pmix_globals.init_cntr; + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_SUCCESS; } /* if we don't see the required info, then we cannot init */ if (NULL == getenv("PMIX_NAMESPACE")) { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_ERR_INVALID_NAMESPACE; } @@ -276,6 +361,7 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, if (PMIX_SUCCESS != (rc = pmix_rte_init(PMIX_PROC_CLIENT, info, ninfo, pmix_client_notify_recv))) { PMIX_ERROR_LOG(rc); + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return rc; } @@ -289,6 +375,7 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, /* we require our nspace */ if (NULL == (evar = getenv("PMIX_NAMESPACE"))) { /* let the caller know that the server isn't available yet */ + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_ERR_INVALID_NAMESPACE; } if (NULL != proc) { @@ -302,6 +389,7 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, /* we also require our rank */ if (NULL == (evar = getenv("PMIX_RANK"))) { /* let the caller know that the server isn't available yet */ + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return 
PMIX_ERR_DATA_VALUE_NOT_FOUND; } pmix_globals.myid.rank = strtol(evar, NULL, 10); @@ -315,6 +403,7 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, * to us at launch */ evar = getenv("PMIX_SECURITY_MODE"); if (PMIX_SUCCESS != (rc = pmix_psec.assign_module(pmix_globals.mypeer, evar))) { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_ERR_INIT; } /* the server will be using the same */ @@ -323,12 +412,14 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, /* setup the shared memory support */ #if defined(PMIX_ENABLE_DSTORE) && (PMIX_ENABLE_DSTORE == 1) if (PMIX_SUCCESS != (rc = pmix_dstore_init(NULL, 0))) { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_ERR_DATA_VALUE_NOT_FOUND; } #endif /* PMIX_ENABLE_DSTORE */ /* connect to the server */ if (PMIX_SUCCESS != (rc = pmix_ptl.connect_to_peer(&pmix_client_globals.myserver, info, ninfo))){ + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return rc; } @@ -339,6 +430,7 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(req, &cmd, 1, PMIX_CMD))) { PMIX_ERROR_LOG(rc); PMIX_RELEASE(req); + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return rc; } /* send to the server */ @@ -346,6 +438,7 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, cb.active = true; if (PMIX_SUCCESS != (rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, req, job_data, (void*)&cb))){ PMIX_DESTRUCT(&cb); + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return rc; } /* wait for the data to return */ @@ -356,6 +449,7 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, if (PMIX_SUCCESS == rc) { pmix_globals.init_cntr++; } else { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return rc; } @@ -381,14 +475,25 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, } PMIX_INFO_DESTRUCT(&ginfo); + /* check to see if we need to notify anyone */ + if (NULL != info) { + _check_for_notify(info, ninfo); + } + + 
pmix_mutex_unlock(&pmix_client_bootstrap_mutex); + return PMIX_SUCCESS; } PMIX_EXPORT int PMIx_Initialized(void) { + pmix_mutex_lock(&pmix_client_bootstrap_mutex); + if (0 < pmix_globals.init_cntr) { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return true; } + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return false; } @@ -400,8 +505,10 @@ PMIX_EXPORT pmix_status_t PMIx_Finalize(const pmix_info_t info[], size_t ninfo) size_t n; volatile bool active; + pmix_mutex_lock(&pmix_client_bootstrap_mutex); if (1 != pmix_globals.init_cntr) { --pmix_globals.init_cntr; + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_SUCCESS; } pmix_globals.init_cntr = 0; @@ -409,6 +516,9 @@ PMIX_EXPORT pmix_status_t PMIx_Finalize(const pmix_info_t info[], size_t ninfo) pmix_output_verbose(2, pmix_globals.debug_output, "pmix:client finalize called"); + /* mark that I called finalize */ + pmix_globals.mypeer->finalized = true; + if ( 0 <= pmix_client_globals.myserver.sd ) { /* check to see if we are supposed to execute a * blocking fence prior to actually finalizing */ @@ -430,6 +540,7 @@ PMIX_EXPORT pmix_status_t PMIx_Finalize(const pmix_info_t info[], size_t ninfo) } } } + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); /* setup a cmd message to notify the PMIx * server that we are normally terminating */ @@ -497,14 +608,18 @@ PMIX_EXPORT pmix_status_t PMIx_Abort(int flag, const char msg[], pmix_output_verbose(2, pmix_globals.debug_output, "pmix:client abort called"); + pmix_mutex_lock(&pmix_client_bootstrap_mutex); if (pmix_globals.init_cntr <= 0) { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_ERR_INIT; } /* if we aren't connected, don't attempt to send */ if (!pmix_globals.connected) { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_ERR_UNREACH; } + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); /* create a buffer to hold the message */ bfr = PMIX_NEW(pmix_buffer_t); @@ -651,9 +766,12 @@ PMIX_EXPORT pmix_status_t 
PMIx_Put(pmix_scope_t scope, const char key[], pmix_va "pmix: executing put for key %s type %d", key, val->type); + pmix_mutex_lock(&pmix_client_bootstrap_mutex); if (pmix_globals.init_cntr <= 0) { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_ERR_INIT; } + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); /* create a callback object */ cb = PMIX_NEW(pmix_cb_t); @@ -737,17 +855,22 @@ static void _commitfn(int sd, short args, void *cbdata) pmix_cb_t *cb; pmix_status_t rc; + pmix_mutex_lock(&pmix_client_bootstrap_mutex); if (pmix_globals.init_cntr <= 0) { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_ERR_INIT; } /* if we are a server, or we aren't connected, don't attempt to send */ if (PMIX_PROC_SERVER == pmix_globals.proc_type) { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_SUCCESS; // not an error } if (!pmix_globals.connected) { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_ERR_UNREACH; } + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); /* create a callback object */ cb = PMIX_NEW(pmix_cb_t); @@ -839,9 +962,12 @@ PMIX_EXPORT pmix_status_t PMIx_Resolve_peers(const char *nodename, pmix_cb_t *cb; pmix_status_t rc; + pmix_mutex_lock(&pmix_client_bootstrap_mutex); if (pmix_globals.init_cntr <= 0) { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_ERR_INIT; } + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); /* create a callback object */ cb = PMIX_NEW(pmix_cb_t); @@ -902,9 +1028,12 @@ PMIX_EXPORT pmix_status_t PMIx_Resolve_nodes(const char *nspace, char **nodelist pmix_cb_t *cb; pmix_status_t rc; + pmix_mutex_lock(&pmix_client_bootstrap_mutex); if (pmix_globals.init_cntr <= 0) { + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); return PMIX_ERR_INIT; } + pmix_mutex_unlock(&pmix_client_bootstrap_mutex); /* create a callback object */ cb = PMIX_NEW(pmix_cb_t); diff --git a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_ops.h b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_ops.h 
index 0de1071595e..4fdcf6c2b33 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_ops.h +++ b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_ops.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -15,6 +15,7 @@ #include "src/buffer_ops/buffer_ops.h" #include "src/class/pmix_hash_table.h" +#include "src/threads/threads.h" BEGIN_C_DECLS @@ -25,6 +26,8 @@ typedef struct { PMIX_EXPORT extern pmix_client_globals_t pmix_client_globals; +PMIX_EXPORT extern pmix_mutex_t pmix_client_bootstrap_mutex; + END_C_DECLS #endif /* PMIX_CLIENT_OPS_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/dstore/pmix_esh.c b/opal/mca/pmix/pmix2x/pmix/src/dstore/pmix_esh.c index 3884253077c..573a83d480c 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/dstore/pmix_esh.c +++ b/opal/mca/pmix/pmix2x/pmix/src/dstore/pmix_esh.c @@ -854,7 +854,13 @@ static inline void _esh_session_release(session_t *s) } _delete_sm_desc(s->sm_seg_first); - close(s->lockfd); + /* the session_t structures are initialized to zero. 
If + * we release the session without having actually assigned + * a locking fd, then we don't want to close that fd + * as it doesn't belong to us */ + if (0 != s->lockfd) { + close(s->lockfd); + } if (NULL != s->lockfile) { if(PMIX_PROC_SERVER == pmix_globals.proc_type) { diff --git a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c index f0ebe09269b..159100666f6 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c +++ b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c @@ -96,7 +96,7 @@ static pmix_status_t notify_server_of_event(pmix_status_t status, pmix_cb_t *cb; pmix_event_chain_t *chain; size_t n; - + pmix_notify_caddy_t *cd, *rbout; pmix_output_verbose(2, pmix_globals.debug_output, "client: notifying server %s:%d of status %s", @@ -106,36 +106,39 @@ static pmix_status_t notify_server_of_event(pmix_status_t status, if (!pmix_globals.connected) { return PMIX_ERR_UNREACH; } - /* create the msg object */ - msg = PMIX_NEW(pmix_buffer_t); - /* pack the command */ - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &cmd, 1, PMIX_CMD))) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - /* pack the status */ - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &status, 1, PMIX_STATUS))) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - /* no need to pack the source as it is us */ + if (PMIX_RANGE_PROC_LOCAL != range) { + /* create the msg object */ + msg = PMIX_NEW(pmix_buffer_t); - /* pack the range */ - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &range, 1, PMIX_DATA_RANGE))) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - /* pack the info */ - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ninfo, 1, PMIX_SIZE))) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - if (0 < ninfo) { - if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, info, ninfo, PMIX_INFO))) { + /* pack the command */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &cmd, 1, PMIX_CMD))) { + PMIX_ERROR_LOG(rc); 
+ goto cleanup; + } + /* pack the status */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &status, 1, PMIX_STATUS))) { + PMIX_ERROR_LOG(rc); + goto cleanup; + } + /* no need to pack the source as it is us */ + + /* pack the range */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &range, 1, PMIX_DATA_RANGE))) { PMIX_ERROR_LOG(rc); goto cleanup; } + /* pack the info */ + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, &ninfo, 1, PMIX_SIZE))) { + PMIX_ERROR_LOG(rc); + goto cleanup; + } + if (0 < ninfo) { + if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(msg, info, ninfo, PMIX_INFO))) { + PMIX_ERROR_LOG(rc); + goto cleanup; + } + } } /* setup for our own local callbacks */ @@ -143,8 +146,9 @@ static pmix_status_t notify_server_of_event(pmix_status_t status, chain->status = status; (void)strncpy(chain->source.nspace, pmix_globals.myid.nspace, PMIX_MAX_NSLEN); chain->source.rank = pmix_globals.myid.rank; - /* we always leave space for a callback object */ - chain->ninfo = ninfo + 1; + /* we always leave space for a callback object and + * the evhandler name. 
*/ + chain->ninfo = ninfo + 2; PMIX_INFO_CREATE(chain->info, chain->ninfo); if (0 < ninfo) { @@ -153,29 +157,86 @@ static pmix_status_t notify_server_of_event(pmix_status_t status, PMIX_INFO_XFER(&chain->info[n], &info[n]); } } + /* put the evhandler name tag in the next-to-last element - we + * will fill it in as each handler is called */ + PMIX_INFO_LOAD(&chain->info[chain->ninfo-2], PMIX_EVENT_HDLR_NAME, NULL, PMIX_STRING); /* now put the callback object tag in the last element */ - PMIX_INFO_LOAD(&chain->info[ninfo], PMIX_EVENT_RETURN_OBJECT, NULL, PMIX_POINTER); - - /* create a callback object as we need to pass it to the - * recv routine so we know which callback to use when - * the server acks/nacks the register events request*/ - cb = PMIX_NEW(pmix_cb_t); - cb->op_cbfunc = cbfunc; - cb->cbdata = cbdata; - /* send to the server */ - pmix_output_verbose(2, pmix_globals.debug_output, - "client: notifying server %s:%d - sending", - pmix_globals.myid.nspace, pmix_globals.myid.rank); - rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, msg, notify_event_cbfunc, cb); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(cb); - goto cleanup; + PMIX_INFO_LOAD(&chain->info[chain->ninfo-1], PMIX_EVENT_RETURN_OBJECT, NULL, PMIX_POINTER); + + /* we need to cache this event so we can pass it into + * ourselves should someone later register for it */ + cd = PMIX_NEW(pmix_notify_caddy_t); + cd->status = status; + if (NULL == source) { + (void)strncpy(cd->source.nspace, "UNDEF", PMIX_MAX_NSLEN); + cd->source.rank = PMIX_RANK_UNDEF; + } else { + (void)strncpy(cd->source.nspace, source->nspace, PMIX_MAX_NSLEN); + cd->source.rank = source->rank; + } + cd->range = range; + + /* check for directives */ + if (NULL != info) { + cd->ninfo = chain->ninfo; + PMIX_INFO_CREATE(cd->info, cd->ninfo); + for (n=0; n < chain->ninfo; n++) { + PMIX_INFO_XFER(&cd->info[n], &chain->info[n]); + if (0 == strncmp(cd->info[n].key, PMIX_EVENT_NON_DEFAULT, PMIX_MAX_KEYLEN)) { + 
cd->nondefault = true; + } else if (0 == strncmp(cd->info[n].key, PMIX_EVENT_CUSTOM_RANGE, PMIX_MAX_KEYLEN)) { + /* provides an array of pmix_proc_t identifying the procs + * that are to receive this notification, or a single pmix_proc_t */ + if (PMIX_DATA_ARRAY == cd->info[n].value.type && + NULL != cd->info[n].value.data.darray && + NULL != cd->info[n].value.data.darray->array) { + cd->ntargets = cd->info[n].value.data.darray->size; + PMIX_PROC_CREATE(cd->targets, cd->ntargets); + memcpy(cd->targets, cd->info[n].value.data.darray->array, cd->ntargets * sizeof(pmix_proc_t)); + } else if (PMIX_PROC == cd->info[n].value.type) { + cd->ntargets = 1; + PMIX_PROC_CREATE(cd->targets, cd->ntargets); + memcpy(cd->targets, cd->info[n].value.data.proc, sizeof(pmix_proc_t)); + } else { + /* this is an error */ + PMIX_ERROR_LOG(PMIX_ERR_BAD_PARAM); + return PMIX_ERR_BAD_PARAM; + } + } + } + } + /* add to our cache */ + rbout = pmix_ring_buffer_push(&pmix_globals.notifications, cd); + /* if an older event was bumped, release it */ + if (NULL != rbout) { + PMIX_RELEASE(rbout); + } + + if (PMIX_RANGE_PROC_LOCAL != range) { + /* create a callback object as we need to pass it to the + * recv routine so we know which callback to use when + * the server acks/nacks the register events request. 
The + * server will _not_ send this notification back to us, + * so we handle it locally */ + cb = PMIX_NEW(pmix_cb_t); + cb->op_cbfunc = cbfunc; + cb->cbdata = cbdata; + /* send to the server */ + pmix_output_verbose(2, pmix_globals.debug_output, + "client: notifying server %s:%d - sending", + pmix_globals.myid.nspace, pmix_globals.myid.rank); + rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, msg, notify_event_cbfunc, cb); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(cb); + goto cleanup; + } + } else { + cbfunc(PMIX_SUCCESS, cbdata); } /* now notify any matching registered callbacks we have */ pmix_invoke_local_event_hdlr(chain); - PMIX_RELEASE(chain); // maintain accounting return PMIX_SUCCESS; @@ -247,7 +308,7 @@ static void progress_local_event_hdlr(pmix_status_t status, chain->nresults = cnt; /* if the caller indicates that the chain is completed, - * or we completed the "last" event, then stop here */ + * or we completed the "last" event */ if (PMIX_EVENT_ACTION_COMPLETE == status || chain->endchain) { goto complete; } @@ -263,6 +324,13 @@ static void progress_local_event_hdlr(pmix_status_t status, if (nxt->codes[0] == chain->status && check_range(&nxt->rng, &chain->source)) { chain->evhdlr = nxt; + /* add the handler name in case they want to reference it */ + if (NULL != chain->info[chain->ninfo-2].value.data.string) { + free(chain->info[chain->ninfo-2].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + } /* add any cbobject - the info struct for it is at the end */ chain->info[chain->ninfo-1].value.data.ptr = nxt->cbobject; nxt->evhdlr(nxt->index, @@ -296,6 +364,13 @@ static void progress_local_event_hdlr(pmix_status_t status, * the source fits within it */ if (nxt->codes[n] == chain->status) { chain->evhdlr = nxt; + /* add the handler name in case they want to reference it */ + if (NULL != chain->info[chain->ninfo-2].value.data.string) { + 
free(chain->info[chain->ninfo-2].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + } /* add any cbobject - the info struct for it is at the end */ chain->info[chain->ninfo-1].value.data.ptr = nxt->cbobject; nxt->evhdlr(nxt->index, @@ -323,6 +398,13 @@ static void progress_local_event_hdlr(pmix_status_t status, * the source fits within it */ if (check_range(&nxt->rng, &chain->source)) { chain->evhdlr = nxt; + /* add the handler name in case they want to reference it */ + if (NULL != chain->info[chain->ninfo-2].value.data.string) { + free(chain->info[chain->ninfo-2].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + } /* add any cbobject - the info struct for it is at the end */ chain->info[chain->ninfo-1].value.data.ptr = nxt->cbobject; nxt->evhdlr(nxt->index, @@ -343,6 +425,13 @@ static void progress_local_event_hdlr(pmix_status_t status, if (1 == pmix_globals.events.last->ncodes && pmix_globals.events.last->codes[0] == chain->status) { chain->evhdlr = pmix_globals.events.last; + /* add the handler name in case they want to reference it */ + if (NULL != chain->info[chain->ninfo-2].value.data.string) { + free(chain->info[chain->ninfo-2].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + } /* add any cbobject - the info struct for it is at the end */ chain->info[chain->ninfo-1].value.data.ptr = pmix_globals.events.last->cbobject; chain->evhdlr->evhdlr(chain->evhdlr->index, @@ -356,6 +445,13 @@ static void progress_local_event_hdlr(pmix_status_t status, for (n=0; n < pmix_globals.events.last->ncodes; n++) { if (pmix_globals.events.last->codes[n] == chain->status) { chain->evhdlr = pmix_globals.events.last; + /* add the handler name in case they want to reference it */ + if (NULL != 
chain->info[chain->ninfo-2].value.data.string) { + free(chain->info[chain->ninfo-2].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + } /* add any cbobject - the info struct for it is at the end */ chain->info[chain->ninfo-1].value.data.ptr = pmix_globals.events.last->cbobject; chain->evhdlr->evhdlr(chain->evhdlr->index, @@ -369,6 +465,13 @@ static void progress_local_event_hdlr(pmix_status_t status, } else { /* gets run for all codes */ chain->evhdlr = pmix_globals.events.last; + /* add the handler name in case they want to reference it */ + if (NULL != chain->info[chain->ninfo-2].value.data.string) { + free(chain->info[chain->ninfo-2].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + } /* add any cbobject - the info struct for it is at the end */ chain->info[chain->ninfo-1].value.data.ptr = pmix_globals.events.last->cbobject; chain->evhdlr->evhdlr(chain->evhdlr->index, @@ -413,8 +516,9 @@ void pmix_invoke_local_event_hdlr(pmix_event_chain_t *chain) bool found; pmix_output_verbose(2, pmix_globals.debug_output, - "%s:%d invoke_local_event_hdlr", - pmix_globals.myid.nspace, pmix_globals.myid.rank); + "%s:%d invoke_local_event_hdlr for status %s", + pmix_globals.myid.nspace, pmix_globals.myid.rank, + PMIx_Error_string(chain->status)); /* sanity check */ if (NULL == chain->info) { @@ -492,19 +596,42 @@ void pmix_invoke_local_event_hdlr(pmix_event_chain_t *chain) } } - /* if they didn't want it to go to a default handler, then we are done */ - if (chain->nondefault) { - goto complete; + /* if they didn't want it to go to a default handler, then ignore them */ + if (!chain->nondefault) { + /* pass it to any default handlers */ + PMIX_LIST_FOREACH(evhdlr, &pmix_globals.events.default_events, pmix_event_hdlr_t) { + if (check_range(&evhdlr->rng, &chain->source)) { + /* invoke the handler */ + 
chain->evhdlr = evhdlr; + goto invk; + } + } } - /* finally, pass it to any default handlers */ - PMIX_LIST_FOREACH(evhdlr, &pmix_globals.events.default_events, pmix_event_hdlr_t) { - if (check_range(&evhdlr->rng, &chain->source)) { - /* invoke the handler */ - chain->evhdlr = evhdlr; + /* if we registered a "last" handler, and it fits the given range + * and code, then invoke it now */ + if (NULL != pmix_globals.events.last && + check_range(&pmix_globals.events.last->rng, &chain->source)) { + chain->endchain = true; // ensure we don't do this again + if (1 == pmix_globals.events.last->ncodes && + pmix_globals.events.last->codes[0] == chain->status) { + chain->evhdlr = pmix_globals.events.last; + goto invk; + } else if (NULL != pmix_globals.events.last->codes) { + /* need to check if this code is included in the array */ + for (i=0; i < pmix_globals.events.last->ncodes; i++) { + if (pmix_globals.events.last->codes[i] == chain->status) { + chain->evhdlr = pmix_globals.events.last; + goto invk; + } + } + } else { + /* gets run for all codes */ + chain->evhdlr = pmix_globals.events.last; goto invk; } } + /* if we got here, then nothing was found */ complete: /* we still have to call their final callback */ @@ -516,9 +643,18 @@ void pmix_invoke_local_event_hdlr(pmix_event_chain_t *chain) invk: /* invoke the handler */ + /* add the handler name in case they want to reference it */ + if (NULL != chain->info[chain->ninfo-2].value.data.string) { + free(chain->info[chain->ninfo-2].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + } chain->info[chain->ninfo-1].value.data.ptr = chain->evhdlr->cbobject; pmix_output_verbose(2, pmix_globals.debug_output, - "[%s:%d] INVOKING EVHDLR", __FILE__, __LINE__); + "[%s:%d] INVOKING EVHDLR %s", __FILE__, __LINE__, + (NULL == chain->evhdlr->name) ? 
+ "NULL" : chain->evhdlr->name); chain->evhdlr->evhdlr(chain->evhdlr->index, chain->status, &chain->source, chain->info, chain->ninfo, @@ -527,6 +663,15 @@ void pmix_invoke_local_event_hdlr(pmix_event_chain_t *chain) return; } +static void local_cbfunc(pmix_status_t status, void *cbdata) +{ + pmix_notify_caddy_t *cd = (pmix_notify_caddy_t*)cbdata; + + if (NULL != cd->cbfunc) { + cd->cbfunc(status, cd->cbdata); + } + PMIX_RELEASE(cd); +} static void _notify_client_event(int sd, short args, void *cbdata) { @@ -534,8 +679,9 @@ static void _notify_client_event(int sd, short args, void *cbdata) pmix_notify_caddy_t *rbout; pmix_regevents_info_t *reginfoptr; pmix_peer_events_info_t *pr; + pmix_event_chain_t *chain; size_t n; - bool matched; + bool matched, holdcd; pmix_output_verbose(2, pmix_globals.debug_output, "pmix_server: _notify_error notifying clients of error %s", @@ -546,57 +692,102 @@ static void _notify_client_event(int sd, short args, void *cbdata) * the message until all local procs have received it, or it ages to * the point where it gets pushed out by more recent events */ PMIX_RETAIN(cd); - rbout = pmix_ring_buffer_push(&pmix_server_globals.notifications, cd); + rbout = pmix_ring_buffer_push(&pmix_globals.notifications, cd); /* if an older event was bumped, release it */ if (NULL != rbout) { PMIX_RELEASE(rbout); } - /* cycle across our registered events and send the message to - * any client who registered for it */ - PMIX_LIST_FOREACH(reginfoptr, &pmix_server_globals.events, pmix_regevents_info_t) { - if ((PMIX_MAX_ERR_CONSTANT == reginfoptr->code && !cd->nondefault) || - cd->status == reginfoptr->code) { - PMIX_LIST_FOREACH(pr, ®infoptr->peers, pmix_peer_events_info_t) { - /* if this client was the source of the event, then - * don't send it back */ - if (0 == strncmp(cd->source.nspace, pr->peer->info->nptr->nspace, PMIX_MAX_NSLEN) && - cd->source.rank == pr->peer->info->rank) { - continue; - } - /* if we were given specific targets, check if this is one 
*/ - if (NULL != cd->targets) { - matched = false; - for (n=0; n < cd->ntargets; n++) { - if (0 != strncmp(pr->peer->info->nptr->nspace, cd->targets[n].nspace, PMIX_MAX_NSLEN)) { - continue; + holdcd = false; + if (PMIX_RANGE_PROC_LOCAL != cd->range) { + /* cycle across our registered events and send the message to + * any client who registered for it */ + PMIX_LIST_FOREACH(reginfoptr, &pmix_server_globals.events, pmix_regevents_info_t) { + if ((PMIX_MAX_ERR_CONSTANT == reginfoptr->code && !cd->nondefault) || + cd->status == reginfoptr->code) { + PMIX_LIST_FOREACH(pr, ®infoptr->peers, pmix_peer_events_info_t) { + /* if this client was the source of the event, then + * don't send it back as they will have processed it + * when they generated it */ + if (0 == strncmp(cd->source.nspace, pr->peer->info->nptr->nspace, PMIX_MAX_NSLEN) && + cd->source.rank == pr->peer->info->rank) { + continue; + } + /* if we were given specific targets, check if this is one */ + if (NULL != cd->targets) { + matched = false; + for (n=0; n < cd->ntargets; n++) { + if (0 != strncmp(pr->peer->info->nptr->nspace, cd->targets[n].nspace, PMIX_MAX_NSLEN)) { + continue; + } + if (PMIX_RANK_WILDCARD == cd->targets[n].rank || + pr->peer->info->rank == cd->targets[n].rank) { + matched = true; + break; + } } - if (PMIX_RANK_WILDCARD == cd->targets[n].rank || - pr->peer->info->rank == cd->targets[n].rank) { - matched = true; - break; + if (!matched) { + /* do not notify this one */ + continue; } } - if (!matched) { - /* do not notify this one */ - continue; - } + pmix_output_verbose(2, pmix_globals.debug_output, + "pmix_server: notifying client %s:%d", + pr->peer->info->nptr->nspace, pr->peer->info->rank); + PMIX_RETAIN(cd->buf); + PMIX_SERVER_QUEUE_REPLY(pr->peer, 0, cd->buf); } - pmix_output_verbose(2, pmix_globals.debug_output, - "pmix_server: notifying client %s:%d", - pr->peer->info->nptr->nspace, pr->peer->info->rank); - PMIX_RETAIN(cd->buf); - PMIX_SERVER_QUEUE_REPLY(pr->peer, 0, cd->buf); } } 
+ if (PMIX_RANGE_LOCAL != cd->range && + 0 == strncmp(cd->source.nspace, pmix_globals.myid.nspace, PMIX_MAX_NSLEN) && + cd->source.rank == pmix_globals.myid.rank) { + /* if we are the source, then we need to post this upwards as + * well so the host RM can broadcast it as necessary - we rely + * on the host RM to _not_ deliver this back to us! */ + if (NULL != pmix_host_server.notify_event) { + /* mark that we sent it upstairs so we don't release + * the caddy until we return from the host RM */ + holdcd = true; + pmix_host_server.notify_event(cd->status, &cd->source, cd->range, + cd->info, cd->ninfo, local_cbfunc, cd); + } + + } } - /* notify the caller */ - if (NULL != cd->cbfunc) { - cd->cbfunc(PMIX_SUCCESS, cd->cbdata); + /* we may also have registered for events, so be sure to check this + * against our registrations */ + chain = PMIX_NEW(pmix_event_chain_t); + chain->status = cd->status; + (void)strncpy(chain->source.nspace, cd->source.nspace, PMIX_MAX_NSLEN); + chain->source.rank = cd->source.rank; + /* we always leave space for a callback object and + * the evhandler name. 
*/ + chain->ninfo = cd->ninfo + 2; + PMIX_INFO_CREATE(chain->info, chain->ninfo); + if (0 < cd->ninfo) { + /* need to copy the info */ + for (n=0; n < cd->ninfo; n++) { + PMIX_INFO_XFER(&chain->info[n], &cd->info[n]); + } + } + /* put the evhandler name tag in the next-to-last element - we + * will fill it in as each handler is called */ + PMIX_INFO_LOAD(&chain->info[chain->ninfo-2], PMIX_EVENT_HDLR_NAME, NULL, PMIX_STRING); + /* now put the callback object tag in the last element */ + PMIX_INFO_LOAD(&chain->info[chain->ninfo-1], PMIX_EVENT_RETURN_OBJECT, NULL, PMIX_POINTER); + /* process it */ + pmix_invoke_local_event_hdlr(chain); + + if (!holdcd) { + /* notify the caller */ + if (NULL != cd->cbfunc) { + cd->cbfunc(PMIX_SUCCESS, cd->cbdata); + } + PMIX_RELEASE(cd); } - PMIX_RELEASE(cd); } @@ -776,6 +967,24 @@ static bool check_range(pmix_range_trkr_t *rng, return false; } +void pmix_event_timeout_cb(int fd, short flags, void *arg) +{ + pmix_event_chain_t *ch = (pmix_event_chain_t*)arg; + + ch->timer_active = false; + + /* remove it from the list */ + pmix_list_remove_item(&pmix_globals.cached_events, &ch->super); + + /* process this event thru the regular channels */ + if (PMIX_PROC_SERVER == pmix_globals.proc_type) { + pmix_server_notify_client_of_event(ch->status, &ch->source, + ch->range, ch->info, ch->ninfo, + ch->final_cbfunc, ch->final_cbdata); + } else { + pmix_invoke_local_event_hdlr(ch); + } +} /**** CLASS INSTANTIATIONS ****/ @@ -849,6 +1058,7 @@ PMIX_CLASS_INSTANCE(pmix_events_t, static void chcon(pmix_event_chain_t *p) { + p->timer_active = false; memset(p->source.nspace, 0, PMIX_MAX_NSLEN+1); p->source.rank = PMIX_RANK_UNDEF; p->nondefault = false; @@ -864,6 +1074,9 @@ static void chcon(pmix_event_chain_t *p) } static void chdes(pmix_event_chain_t *p) { + if (p->timer_active) { + pmix_event_del(&p->ev); + } if (NULL != p->info) { PMIX_INFO_FREE(p->info, p->ninfo); } @@ -872,5 +1085,5 @@ static void chdes(pmix_event_chain_t *p) } } 
PMIX_CLASS_INSTANCE(pmix_event_chain_t, - pmix_object_t, + pmix_list_item_t, chcon, chdes); diff --git a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_registration.c b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_registration.c index 134bece6ea4..03767050182 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_registration.c +++ b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_registration.c @@ -234,6 +234,8 @@ static pmix_status_t _add_hdlr(pmix_rshift_caddy_t *cd, pmix_list_t *xfer) active->code = PMIX_MAX_ERR_CONSTANT; active->nregs = 1; pmix_list_append(&pmix_globals.events.actives, &active->super); + /* ensure we register it */ + need_register = true; } } else { for (n=0; n < cd->ncodes; n++) { @@ -325,20 +327,22 @@ static pmix_status_t _add_hdlr(pmix_rshift_caddy_t *cd, pmix_list_t *xfer) static void reg_event_hdlr(int sd, short args, void *cbdata) { - size_t index = 0, n; - pmix_status_t rc; pmix_rshift_caddy_t *cd = (pmix_rshift_caddy_t*)cbdata; + size_t index = 0, n, i; + pmix_status_t rc; pmix_event_hdlr_t *evhdlr, *ev; uint8_t location = PMIX_EVENT_ORDER_NONE; char *name = NULL, *locator = NULL; bool firstoverall=false, lastoverall=false; - bool found; + bool found, matched; pmix_list_t xfer; pmix_info_caddy_t *ixfer; void *cbobject = NULL; pmix_data_range_t range = PMIX_RANGE_UNDEF; pmix_proc_t *parray = NULL; size_t nprocs; + pmix_notify_caddy_t *ncd; + pmix_event_chain_t *chain; pmix_output_verbose(2, pmix_globals.debug_output, "pmix: register event_hdlr with %d infos", (int)cd->ninfo); @@ -672,6 +676,66 @@ static void reg_event_hdlr(int sd, short args, void *cbdata) cd->evregcbfn(rc, index, cd->cbdata); } + /* check if any matching notifications have been cached */ + for (i=0; i < (size_t)pmix_globals.notifications.size; i++) { + if (NULL == (ncd = (pmix_notify_caddy_t*)pmix_ring_buffer_poke(&pmix_globals.notifications, i))) { + break; + } + found = false; + if (NULL == cd->codes) { + /* they registered a default event handler - always 
matches */ + found = true; + } else { + for (n=0; n < cd->ncodes; n++) { + if (cd->codes[n] == ncd->status) { + found = true; + break; + } + } + } + if (found) { + /* if we were given specific targets, check if we are one */ + if (NULL != ncd->targets) { + matched = false; + for (n=0; n < ncd->ntargets; n++) { + if (0 != strncmp(pmix_globals.myid.nspace, ncd->targets[n].nspace, PMIX_MAX_NSLEN)) { + continue; + } + if (PMIX_RANK_WILDCARD == ncd->targets[n].rank || + pmix_globals.myid.rank == ncd->targets[n].rank) { + matched = true; + break; + } + } + if (!matched) { + /* do not notify this one */ + continue; + } + } + /* all matches - notify */ + chain = PMIX_NEW(pmix_event_chain_t); + chain->status = ncd->status; + (void)strncpy(chain->source.nspace, pmix_globals.myid.nspace, PMIX_MAX_NSLEN); + chain->source.rank = pmix_globals.myid.rank; + /* we already left space for evhandler name plus + * a callback object when we cached the notification */ + chain->ninfo = ncd->ninfo; + PMIX_INFO_CREATE(chain->info, chain->ninfo); + if (0 < cd->ninfo) { + /* need to copy the info */ + for (n=0; n < ncd->ninfo; n++) { + PMIX_INFO_XFER(&chain->info[n], &ncd->info[n]); + } + } + /* we don't want this chain to propagate, so indicate it + * should only be run as a single-shot */ + chain->endchain = true; + /* now notify any matching registered callbacks we have */ + pmix_invoke_local_event_hdlr(chain); + } + } + + /* all done */ PMIX_RELEASE(cd); } @@ -850,11 +914,10 @@ static void dereg_event_hdlr(int sd, short args, void *cbdata) } } /* if we get here, then the registration could not be found */ - if (NULL != cd->cbfunc.opcbfn) { - cd->cbfunc.opcbfn(PMIX_ERR_NOT_FOUND, cd->cbdata); + if (NULL != msg) { + PMIX_RELEASE(msg); } - PMIX_RELEASE(cd); - return; + goto cleanup; report: if (NULL != msg) { diff --git a/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.c b/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.c index bdfb143c9af..5dfbcd4d72a 100644 --- 
a/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.c +++ b/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.c @@ -71,6 +71,7 @@ PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_cb_t, static void pcon(pmix_peer_t *p) { + p->finalized = false; p->info = NULL; p->proc_cnt = 0; p->server_object = NULL; @@ -249,9 +250,9 @@ PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_shift_caddy_t, pmix_object_t, scon, scdes); -PMIX_CLASS_INSTANCE(pmix_info_caddy_t, - pmix_list_item_t, - NULL, NULL); +PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_info_caddy_t, + pmix_list_item_t, + NULL, NULL); static void qcon(pmix_query_caddy_t *p) { @@ -280,6 +281,6 @@ static void jdcon(pmix_job_data_caddy_t *p) #endif } -PMIX_CLASS_INSTANCE(pmix_job_data_caddy_t, - pmix_object_t, - jdcon, NULL); +PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_job_data_caddy_t, + pmix_object_t, + jdcon, NULL); diff --git a/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.h b/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.h index 1333cb24f1f..0e5548f7336 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.h +++ b/opal/mca/pmix/pmix2x/pmix/src/include/pmix_globals.h @@ -36,6 +36,7 @@ #include "src/buffer_ops/types.h" #include "src/class/pmix_hash_table.h" #include "src/class/pmix_list.h" +#include "src/class/pmix_ring_buffer.h" #include "src/event/pmix_event.h" #include "src/mca/psec/psec.h" @@ -166,6 +167,7 @@ typedef struct pmix_personality_t { * by the socket, not the process nspace/rank */ typedef struct pmix_peer_t { pmix_object_t super; + bool finalized; pmix_rank_info_t *info; int proc_cnt; void *server_object; @@ -358,21 +360,24 @@ PMIX_CLASS_DECLARATION(pmix_info_caddy_t); * between various parts of the code library. 
Both the client * and server libraries must instance this structure */ typedef struct { - int init_cntr; // #times someone called Init - #times called Finalize + int init_cntr; // #times someone called Init - #times called Finalize pmix_proc_t myid; - pmix_peer_t *mypeer; // my own peer object + pmix_peer_t *mypeer; // my own peer object pmix_proc_type_t proc_type; - uid_t uid; // my effective uid - gid_t gid; // my effective gid + uid_t uid; // my effective uid + gid_t gid; // my effective gid int pindex; pmix_event_base_t *evbase; bool external_evbase; int debug_output; - pmix_events_t events; // my event handler registrations. + pmix_events_t events; // my event handler registrations. bool connected; - pmix_list_t nspaces; // list of pmix_nspace_t for the nspaces we know about - pmix_buffer_t *cache_local; // data PUT by me to local scope - pmix_buffer_t *cache_remote; // data PUT by me to remote scope + pmix_list_t nspaces; // list of pmix_nspace_t for the nspaces we know about + pmix_buffer_t *cache_local; // data PUT by me to local scope + pmix_buffer_t *cache_remote; // data PUT by me to remote scope + struct timeval event_window; + pmix_list_t cached_events; // events waiting in the window prior to processing + pmix_ring_buffer_t notifications; // ring buffer of pending notifications } pmix_globals_t; diff --git a/opal/mca/pmix/pmix2x/pmix/src/include/pmix_stdint.h b/opal/mca/pmix/pmix2x/pmix/src/include/pmix_stdint.h index 982a442671b..28c3099ef37 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/include/pmix_stdint.h +++ b/opal/mca/pmix/pmix2x/pmix/src/include/pmix_stdint.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -9,8 +10,11 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2016 IBM Corporation. 
All rights reserved. - * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -35,105 +39,36 @@ #include #endif -/* 8-bit */ - -#if SIZEOF_CHAR == 1 - -#ifndef HAVE_INT8_T -typedef signed char int8_t; -#endif - -#ifndef HAVE_UINT8_T -typedef unsigned char uint8_t; -#endif - -#else - -#error Failed to define 8-bit types - -#endif - -/* 16-bit */ - -#if SIZEOF_SHORT == 2 - -#ifndef HAVE_INT16_T -typedef signed short int16_t; -#endif - -#ifndef HAVE_UINT16_T -typedef unsigned short uint16_t; -#endif - -#else - -#error Failed to define 16-bit types - -#endif - -/* 32-bit */ - -#if SIZEOF_INT == 4 +/* 128-bit */ -#ifndef HAVE_INT32_T -typedef signed int int32_t; -#endif +#ifdef HAVE_INT128_T -#ifndef HAVE_UINT32_T -typedef unsigned int uint32_t; -#endif +typedef int128_t pmix_int128_t; +typedef uint128_t pmix_uint128_t; -#elif SIZEOF_LONG == 4 +#define HAVE_PMIX_INT128_T 1 -#ifndef HAVE_INT32_T -typedef signed long int32_t; -#endif - -#ifndef HAVE_UINT32_T -typedef unsigned long uint32_t; -#endif +#elif defined(HAVE___INT128) +/* suppress warning about __int128 type */ +#pragma GCC diagnostic push +/* Clang won't quietly accept "-pedantic", but GCC versions older than ~4.8 + * won't quietly accept "-Wpedanic". The whole "#pragma GCC diagnostic ..." + * facility only was added to GCC as of version 4.6. 
*/ +#if defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 6) +#pragma GCC diagnostic ignored "-Wpedantic" #else - -#error Failed to define 32-bit types - -#endif - -/* 64-bit */ - -#if SIZEOF_INT == 8 - -#ifndef HAVE_INT64_T -typedef signed int int64_t; -#endif - -#ifndef HAVE_UINT64_T -typedef unsigned int uint64_t; -#endif - -#elif SIZEOF_LONG == 8 - -#ifndef HAVE_INT64_T -typedef signed long int64_t; -#endif - -#ifndef HAVE_UINT64_T -typedef unsigned long uint64_t; -#endif - -#elif HAVE_LONG_LONG && SIZEOF_LONG_LONG == 8 - -#ifndef HAVE_INT64_T -typedef signed long long int64_t; +#pragma GCC diagnostic ignored "-pedantic" #endif +typedef __int128 pmix_int128_t; +typedef unsigned __int128 pmix_uint128_t; +#pragma GCC diagnostic pop -#ifndef HAVE_UINT64_T -typedef unsigned long long uint64_t; -#endif +#define HAVE_PMIX_INT128_T 1 #else -#error Failed to define 64-bit types +#define HAVE_PMIX_INT128_T 0 #endif @@ -174,143 +109,8 @@ typedef unsigned long long uintptr_t; #endif -/* fix up some constants that may be missing */ -#ifndef SIZE_MAX -# if SIZEOF_VOID_P == SIZEOF_INT -# define SIZE_MAX UINT_MAX -# elif SIZEOF_VOID_P == SIZEOF_LONG -# define SIZE_MAX ULONG_MAX -# else -# error Failed to find value for SIZE_MAX -# endif -#endif /* ifndef SIZE_MAX */ - - /* inttypes.h printf specifiers */ -#ifdef HAVE_INTTYPES_H # include -#else - -# if SIZEOF_LONG == 8 -# define __PRI64_PREFIX "l" -# define __PRIPTR_PREFIX "l" -# else -# define __PRI64_PREFIX "ll" -# define __PRIPTR_PREFIX -# endif - -/* Decimal notation. 
*/ -# define PRId8 "d" -# define PRId16 "d" -# define PRId32 "d" -# define PRId64 __PRI64_PREFIX "d" - -# define PRIdLEAST8 "d" -# define PRIdLEAST16 "d" -# define PRIdLEAST32 "d" -# define PRIdLEAST64 __PRI64_PREFIX "d" - -# define PRIdFAST8 "d" -# define PRIdFAST16 __PRIPTR_PREFIX "d" -# define PRIdFAST32 __PRIPTR_PREFIX "d" -# define PRIdFAST64 __PRI64_PREFIX "d" - -# define PRIi8 "i" -# define PRIi16 "i" -# define PRIi32 "i" -# define PRIi64 __PRI64_PREFIX "i" - -# define PRIiLEAST8 "i" -# define PRIiLEAST16 "i" -# define PRIiLEAST32 "i" -# define PRIiLEAST64 __PRI64_PREFIX "i" - -# define PRIiFAST8 "i" -# define PRIiFAST16 __PRIPTR_PREFIX "i" -# define PRIiFAST32 __PRIPTR_PREFIX "i" -# define PRIiFAST64 __PRI64_PREFIX "i" - -/* Octal notation. */ -# define PRIo8 "o" -# define PRIo16 "o" -# define PRIo32 "o" -# define PRIo64 __PRI64_PREFIX "o" - -# define PRIoLEAST8 "o" -# define PRIoLEAST16 "o" -# define PRIoLEAST32 "o" -# define PRIoLEAST64 __PRI64_PREFIX "o" - -# define PRIoFAST8 "o" -# define PRIoFAST16 __PRIPTR_PREFIX "o" -# define PRIoFAST32 __PRIPTR_PREFIX "o" -# define PRIoFAST64 __PRI64_PREFIX "o" - -/* Unsigned integers. */ -# define PRIu8 "u" -# define PRIu16 "u" -# define PRIu32 "u" -# define PRIu64 __PRI64_PREFIX "u" - -# define PRIuLEAST8 "u" -# define PRIuLEAST16 "u" -# define PRIuLEAST32 "u" -# define PRIuLEAST64 __PRI64_PREFIX "u" - -# define PRIuFAST8 "u" -# define PRIuFAST16 __PRIPTR_PREFIX "u" -# define PRIuFAST32 __PRIPTR_PREFIX "u" -# define PRIuFAST64 __PRI64_PREFIX "u" - -/* lowercase hexadecimal notation. */ -# define PRIx8 "x" -# define PRIx16 "x" -# define PRIx32 "x" -# define PRIx64 __PRI64_PREFIX "x" - -# define PRIxLEAST8 "x" -# define PRIxLEAST16 "x" -# define PRIxLEAST32 "x" -# define PRIxLEAST64 __PRI64_PREFIX "x" - -# define PRIxFAST8 "x" -# define PRIxFAST16 __PRIPTR_PREFIX "x" -# define PRIxFAST32 __PRIPTR_PREFIX "x" -# define PRIxFAST64 __PRI64_PREFIX "x" - -/* UPPERCASE hexadecimal notation. 
*/ -# define PRIX8 "X" -# define PRIX16 "X" -# define PRIX32 "X" -# define PRIX64 __PRI64_PREFIX "X" - -# define PRIXLEAST8 "X" -# define PRIXLEAST16 "X" -# define PRIXLEAST32 "X" -# define PRIXLEAST64 __PRI64_PREFIX "X" - -# define PRIXFAST8 "X" -# define PRIXFAST16 __PRIPTR_PREFIX "X" -# define PRIXFAST32 __PRIPTR_PREFIX "X" -# define PRIXFAST64 __PRI64_PREFIX "X" - -/* Macros for printing `intmax_t' and `uintmax_t'. */ -# define PRIdMAX __PRI64_PREFIX "d" -# define PRIiMAX __PRI64_PREFIX "i" -# define PRIoMAX __PRI64_PREFIX "o" -# define PRIuMAX __PRI64_PREFIX "u" -# define PRIxMAX __PRI64_PREFIX "x" -# define PRIXMAX __PRI64_PREFIX "X" - -/* Macros for printing `intptr_t' and `uintptr_t'. */ -# define PRIdPTR __PRIPTR_PREFIX "d" -# define PRIiPTR __PRIPTR_PREFIX "i" -# define PRIoPTR __PRIPTR_PREFIX "o" -# define PRIuPTR __PRIPTR_PREFIX "u" -# define PRIxPTR __PRIPTR_PREFIX "x" -# define PRIXPTR __PRIPTR_PREFIX "X" - -#endif #ifndef PRIsize_t # if defined(ACCEPT_C99) diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c index 705d7861ab7..5301d8a0216 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c @@ -41,6 +41,7 @@ #include "src/class/pmix_pointer_array.h" #include "src/include/pmix_globals.h" +#include "src/client/pmix_client_ops.h" #include "src/server/pmix_server_ops.h" #include "src/util/error.h" @@ -137,9 +138,16 @@ static void lost_connection(pmix_peer_t *peer, pmix_status_t err) break; } } - } - } - PMIX_RELEASE(peer); + } + } + if (!peer->finalized) { + /* if this peer already called finalize, then + * we are just seeing their connection go away + * when they terminate - so do not generate + * an event. 
If not, then we do */ + PMIX_REPORT_EVENT(err, peer, PMIX_RANGE_NAMESPACE, _notify_complete); + } + PMIX_RELEASE(peer); } else { /* if I am a client, there is only * one connection we can have */ @@ -163,8 +171,11 @@ static void lost_connection(pmix_peer_t *peer, pmix_status_t err) } } PMIX_DESTRUCT(&buf); + /* if I called finalize, then don't generate an event */ + if (!pmix_globals.mypeer->finalized) { + PMIX_REPORT_EVENT(err, &pmix_client_globals.myserver, PMIX_RANGE_LOCAL, _notify_complete); + } } - PMIX_REPORT_EVENT(err, _notify_complete); } static pmix_status_t send_msg(int sd, pmix_ptl_send_t *msg) @@ -634,8 +645,8 @@ void pmix_ptl_base_process_msg(int fd, short flags, void *cbdata) * that is an error */ if (PMIX_PTL_TAG_DYNAMIC <= msg->hdr.tag) { pmix_output(0, "UNEXPECTED MESSAGE tag = %d", msg->hdr.tag); + PMIX_REPORT_EVENT(PMIX_ERROR, msg->peer, PMIX_RANGE_NAMESPACE, _notify_complete); PMIX_RELEASE(msg); - PMIX_REPORT_EVENT(PMIX_ERROR, _notify_complete); return; } diff --git a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_finalize.c b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_finalize.c index 5f2f7053628..4caeea2f56d 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_finalize.c +++ b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_finalize.c @@ -114,6 +114,8 @@ void pmix_rte_finalize(void) PMIX_RELEASE(pmix_globals.cache_remote); } PMIX_DESTRUCT(&pmix_globals.events); + PMIX_LIST_DESTRUCT(&pmix_globals.cached_events); + PMIX_DESTRUCT(&pmix_globals.notifications); /* now safe to release the event base */ if (!pmix_globals.external_evbase) { diff --git a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_init.c b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_init.c index d46ddf337d3..0249279960f 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_init.c +++ b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_init.c @@ -15,7 +15,7 @@ * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2010-2015 Los Alamos National Security, LLC. 
* All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -153,6 +153,13 @@ int pmix_rte_init(pmix_proc_type_t type, memset(&pmix_globals.myid, 0, sizeof(pmix_proc_t)); PMIX_CONSTRUCT(&pmix_globals.nspaces, pmix_list_t); PMIX_CONSTRUCT(&pmix_globals.events, pmix_events_t); + pmix_globals.event_window.tv_sec = pmix_event_caching_window; + pmix_globals.event_window.tv_usec = 0; + PMIX_CONSTRUCT(&pmix_globals.cached_events, pmix_list_t); + /* construct the global notification ring buffer */ + PMIX_CONSTRUCT(&pmix_globals.notifications, pmix_ring_buffer_t); + pmix_ring_buffer_init(&pmix_globals.notifications, 256); + /* get our effective id's */ pmix_globals.uid = geteuid(); pmix_globals.gid = getegid(); diff --git a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_progress_threads.c b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_progress_threads.c index f3002445cb4..efa32eaa6b3 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_progress_threads.c +++ b/opal/mca/pmix/pmix2x/pmix/src/runtime/pmix_progress_threads.c @@ -21,53 +21,12 @@ #include PMIX_EVENT_HEADER #include "src/class/pmix_list.h" +#include "src/threads/threads.h" #include "src/util/error.h" #include "src/util/fd.h" #include "src/runtime/pmix_progress_threads.h" -/* define a thread object */ -#define PMIX_THREAD_CANCELLED ((void*)1); -typedef void *(*pmix_thread_fn_t) (pmix_object_t *); - -typedef struct pmix_thread_t { - pmix_object_t super; - pmix_thread_fn_t t_run; - void* t_arg; - pthread_t t_handle; -} pmix_thread_t; -static void ptcon(pmix_thread_t *p) -{ - p->t_arg = NULL; - p->t_handle = (pthread_t) -1; -} -PMIX_CLASS_INSTANCE(pmix_thread_t, - pmix_object_t, - ptcon, NULL); - -static int pmix_thread_start(pmix_thread_t *t) -{ - int rc; - - if (PMIX_ENABLE_DEBUG) { - if (NULL == t->t_run || 
t->t_handle != (pthread_t) -1) { - return PMIX_ERR_BAD_PARAM; - } - } - - rc = pthread_create(&t->t_handle, NULL, (void*(*)(void*)) t->t_run, t); - - return (rc == 0) ? PMIX_SUCCESS : PMIX_ERROR; -} - - -static int pmix_thread_join(pmix_thread_t *t, void **thr_return) -{ - int rc = pthread_join(t->t_handle, thr_return); - t->t_handle = (pthread_t) -1; - return (rc == 0) ? PMIX_SUCCESS : PMIX_ERROR; -} - /* create a tracking object for progress threads */ typedef struct { diff --git a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c index ed445a4a927..bcfe3a2c7e9 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c +++ b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c @@ -113,8 +113,6 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module) PMIX_CONSTRUCT(&pmix_server_globals.gdata, pmix_buffer_t); PMIX_CONSTRUCT(&pmix_server_globals.events, pmix_list_t); PMIX_CONSTRUCT(&pmix_server_globals.local_reqs, pmix_list_t); - PMIX_CONSTRUCT(&pmix_server_globals.notifications, pmix_ring_buffer_t); - pmix_ring_buffer_init(&pmix_server_globals.notifications, 256); pmix_output_verbose(2, pmix_globals.debug_output, "pmix:server init called"); @@ -261,7 +259,6 @@ PMIX_EXPORT pmix_status_t PMIx_server_finalize(void) PMIX_LIST_DESTRUCT(&pmix_server_globals.remote_pnd); PMIX_LIST_DESTRUCT(&pmix_server_globals.local_reqs); PMIX_DESTRUCT(&pmix_server_globals.gdata); - PMIX_DESTRUCT(&pmix_server_globals.notifications); PMIX_LIST_DESTRUCT(&pmix_server_globals.events); if (NULL != security_mode) { @@ -1018,7 +1015,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_dmodex_request(const pmix_proc_t *proc, } pmix_output_verbose(2, pmix_globals.debug_output, - "pmix:server register client %s:%d", + "pmix:server dmodex request%s:%d", proc->nspace, proc->rank); cd = PMIX_NEW(pmix_setup_caddy_t); @@ -2220,6 +2217,8 @@ static pmix_status_t server_switchyard(pmix_peer_t *peer, uint32_t tag, if (PMIX_FINALIZE_CMD == 
cmd) { pmix_output_verbose(2, pmix_globals.debug_output, "recvd FINALIZE"); + /* mark that this peer called finalize */ + peer->finalized = true; /* call the local server, if supported */ if (NULL != pmix_host_server.client_finalized) { PMIX_PEER_CADDY(cd, peer, tag); diff --git a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.c b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.c index 5add656abf1..97fdd7cdfe9 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.c +++ b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.c @@ -1160,8 +1160,8 @@ pmix_status_t pmix_server_register_events(pmix_peer_t *peer, check: /* check if any matching notifications have been cached */ - for (i=0; i < pmix_server_globals.notifications.size; i++) { - if (NULL == (cd = (pmix_notify_caddy_t*)pmix_ring_buffer_poke(&pmix_server_globals.notifications, i))) { + for (i=0; i < pmix_globals.notifications.size; i++) { + if (NULL == (cd = (pmix_notify_caddy_t*)pmix_ring_buffer_poke(&pmix_globals.notifications, i))) { break; } found = false; diff --git a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.h b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.h index f502cd33a35..f978e058b33 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.h +++ b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server_ops.h @@ -111,7 +111,6 @@ typedef struct { pmix_list_t local_reqs; // list of pmix_dmdx_local_t awaiting arrival of data from local neighbours pmix_buffer_t gdata; // cache of data given to me for passing to all clients pmix_list_t events; // list of pmix_regevents_info_t registered events - pmix_ring_buffer_t notifications; // ring buffer of pending notifications bool tool_connections_allowed; } pmix_server_globals_t; diff --git a/opal/mca/pmix/pmix2x/pmix/src/threads/Makefile.include b/opal/mca/pmix/pmix2x/pmix/src/threads/Makefile.include new file mode 100644 index 00000000000..ba93edb67ab --- /dev/null +++ 
b/opal/mca/pmix/pmix2x/pmix/src/threads/Makefile.include @@ -0,0 +1,40 @@ +# -*- makefile -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2016 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2015 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This makefile.am does not stand on its own - it is included from pmix/Makefile.am + +# Source code files +headers += \ + threads/condition.h \ + threads/mutex.h \ + threads/mutex_unix.h \ + threads/threads.h \ + threads/tsd.h \ + threads/wait_sync.h \ + threads/thread_usage.h + +libpmix_la_SOURCES += \ + threads/condition.c \ + threads/mutex.c \ + threads/thread.c \ + threads/wait_sync.c diff --git a/opal/mca/pmix/pmix2x/pmix/src/threads/condition.c b/opal/mca/pmix/pmix2x/pmix/src/threads/condition.c new file mode 100644 index 00000000000..13a9d3ab164 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/threads/condition.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. 
+ * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "pmix_config.h" + +#include "src/threads/condition.h" + + +static void pmix_condition_construct(pmix_condition_t *c) +{ + c->c_waiting = 0; + c->c_signaled = 0; +} + + +static void pmix_condition_destruct(pmix_condition_t *c) +{ +} + +PMIX_CLASS_INSTANCE(pmix_condition_t, + pmix_object_t, + pmix_condition_construct, + pmix_condition_destruct); diff --git a/opal/mca/pmix/pmix2x/pmix/src/threads/condition.h b/opal/mca/pmix/pmix2x/pmix/src/threads/condition.h new file mode 100644 index 00000000000..7a18660d8f2 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/threads/condition.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#ifndef PMIX_CONDITION_SPINLOCK_H +#define PMIX_CONDITION_SPINLOCK_H + +#include "pmix_config.h" +#ifdef HAVE_SYS_TIME_H +#include +#endif +#include +#include + +#include "src/threads/mutex.h" + +BEGIN_C_DECLS + +struct pmix_condition_t { + pmix_object_t super; + volatile int c_waiting; + volatile int c_signaled; +}; +typedef struct pmix_condition_t pmix_condition_t; + +PMIX_EXPORT PMIX_CLASS_DECLARATION(pmix_condition_t); + + +static inline int pmix_condition_wait(pmix_condition_t *c, pmix_mutex_t *m) +{ + int rc = 0; + c->c_waiting++; + + if (c->c_signaled) { + c->c_waiting--; + return 0; + } + + c->c_signaled--; + c->c_waiting--; + return rc; +} + +static inline int pmix_condition_signal(pmix_condition_t *c) +{ + if (c->c_waiting) { + c->c_signaled++; + } + return 0; +} + +static inline int pmix_condition_broadcast(pmix_condition_t *c) +{ + c->c_signaled = c->c_waiting; + return 0; +} + +END_C_DECLS + +#endif diff --git a/opal/mca/pmix/pmix2x/pmix/src/threads/mutex.c b/opal/mca/pmix/pmix2x/pmix/src/threads/mutex.c new file mode 100644 index 00000000000..d7f5e9298e8 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/threads/mutex.c @@ -0,0 +1,94 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights + * reserved. 
+ * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "pmix_config.h" + +#include "src/threads/mutex.h" + +static void pmix_mutex_construct(pmix_mutex_t *m) +{ +#if PMIX_ENABLE_DEBUG + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + + /* set type to ERRORCHECK so that we catch recursive locks */ +#if PMIX_HAVE_PTHREAD_MUTEX_ERRORCHECK_NP + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK_NP); +#elif PMIX_HAVE_PTHREAD_MUTEX_ERRORCHECK + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK); +#endif /* PMIX_HAVE_PTHREAD_MUTEX_ERRORCHECK_NP */ + + pthread_mutex_init(&m->m_lock_pthread, &attr); + pthread_mutexattr_destroy(&attr); + + m->m_lock_debug = 0; + m->m_lock_file = NULL; + m->m_lock_line = 0; +#else + + /* Without debugging, choose the fastest available mutexes */ + pthread_mutex_init(&m->m_lock_pthread, NULL); + +#endif /* PMIX_ENABLE_DEBUG */ + +#if PMIX_HAVE_ATOMIC_SPINLOCKS + pmix_atomic_init( &m->m_lock_atomic, PMIX_ATOMIC_UNLOCKED ); +#endif +} + +static void pmix_mutex_destruct(pmix_mutex_t *m) +{ + pthread_mutex_destroy(&m->m_lock_pthread); +} + +PMIX_CLASS_INSTANCE(pmix_mutex_t, + pmix_object_t, + pmix_mutex_construct, + pmix_mutex_destruct); + +static void pmix_recursive_mutex_construct(pmix_recursive_mutex_t *m) +{ + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + +#if PMIX_ENABLE_DEBUG + m->m_lock_debug = 0; + m->m_lock_file = NULL; + m->m_lock_line = 0; +#endif + + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + + pthread_mutex_init(&m->m_lock_pthread, &attr); + pthread_mutexattr_destroy(&attr); + +#if PMIX_HAVE_ATOMIC_SPINLOCKS + pmix_atomic_init( &m->m_lock_atomic, PMIX_ATOMIC_UNLOCKED ); +#endif +} + +PMIX_CLASS_INSTANCE(pmix_recursive_mutex_t, + pmix_object_t, + 
pmix_recursive_mutex_construct, + pmix_mutex_destruct); diff --git a/opal/mca/pmix/pmix2x/pmix/src/threads/mutex.h b/opal/mca/pmix/pmix2x/pmix/src/threads/mutex.h new file mode 100644 index 00000000000..37a3a4c2d08 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/threads/mutex.h @@ -0,0 +1,103 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2016 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2007 Voltaire. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2017 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_MUTEX_H +#define PMIX_MUTEX_H 1 + +#include "pmix_config.h" + +#include "src/threads/thread_usage.h" + +BEGIN_C_DECLS + +/** + * @file: + * + * Mutual exclusion functions. + * + * Functions for locking of critical sections. + */ + +/** + * Opaque mutex object + */ +typedef struct pmix_mutex_t pmix_mutex_t; +typedef struct pmix_mutex_t pmix_recursive_mutex_t; + +/** + * Try to acquire a mutex. + * + * @param mutex Address of the mutex. + * @return 0 if the mutex was acquired, non-zero otherwise. + */ +static inline int pmix_mutex_trylock(pmix_mutex_t *mutex); + + +/** + * Acquire a mutex. + * + * @param mutex Address of the mutex.
+ */ +static inline void pmix_mutex_lock(pmix_mutex_t *mutex); + + +/** + * Release a mutex. + * + * @param mutex Address of the mutex. + */ +static inline void pmix_mutex_unlock(pmix_mutex_t *mutex); + + +/** + * Try to acquire a mutex using atomic operations. + * + * @param mutex Address of the mutex. + * @return 0 if the mutex was acquired, 1 otherwise. + */ +static inline int pmix_mutex_atomic_trylock(pmix_mutex_t *mutex); + + +/** + * Acquire a mutex using atomic operations. + * + * @param mutex Address of the mutex. + */ +static inline void pmix_mutex_atomic_lock(pmix_mutex_t *mutex); + + +/** + * Release a mutex using atomic operations. + * + * @param mutex Address of the mutex. + */ +static inline void pmix_mutex_atomic_unlock(pmix_mutex_t *mutex); + +END_C_DECLS + +#include "mutex_unix.h" + +#endif /* PMIX_MUTEX_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/threads/mutex_unix.h b/opal/mca/pmix/pmix2x/pmix/src/threads/mutex_unix.h new file mode 100644 index 00000000000..ffe3249040b --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/threads/mutex_unix.h @@ -0,0 +1,215 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2015-2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef PMIX_MUTEX_UNIX_H +#define PMIX_MUTEX_UNIX_H 1 + +/** + * @file: + * + * Mutual exclusion functions: Unix implementation. + * + * Functions for locking of critical sections. + * + * On unix, use pthreads or our own atomic operations as + * available. + */ + +#include "pmix_config.h" + +#include +#include +#include + +#include "src/class/pmix_object.h" +#include "src/atomics/sys/atomic.h" + +BEGIN_C_DECLS + +struct pmix_mutex_t { + pmix_object_t super; + + pthread_mutex_t m_lock_pthread; + +#if PMIX_ENABLE_DEBUG + int m_lock_debug; + const char *m_lock_file; + int m_lock_line; +#endif + + pmix_atomic_lock_t m_lock_atomic; +}; +PMIX_EXPORT PMIX_CLASS_DECLARATION(pmix_mutex_t); +PMIX_EXPORT PMIX_CLASS_DECLARATION(pmix_recursive_mutex_t); + +#if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP) +#define PMIX_PTHREAD_RECURSIVE_MUTEX_INITIALIZER PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP +#elif defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER) +#define PMIX_PTHREAD_RECURSIVE_MUTEX_INITIALIZER PTHREAD_RECURSIVE_MUTEX_INITIALIZER +#endif + +#if PMIX_ENABLE_DEBUG +#define PMIX_MUTEX_STATIC_INIT \ + { \ + .super = PMIX_OBJ_STATIC_INIT(pmix_mutex_t), \ + .m_lock_pthread = PTHREAD_MUTEX_INITIALIZER, \ + .m_lock_debug = 0, \ + .m_lock_file = NULL, \ + .m_lock_line = 0, \ + .m_lock_atomic = { .u = { .lock = PMIX_ATOMIC_UNLOCKED } }, \ + } +#else +#define PMIX_MUTEX_STATIC_INIT \ + { \ + .super = PMIX_OBJ_STATIC_INIT(pmix_mutex_t), \ + .m_lock_pthread = PTHREAD_MUTEX_INITIALIZER, \ + .m_lock_atomic = { .u = { .lock = PMIX_ATOMIC_UNLOCKED } }, \ + } +#endif + +#if defined(PMIX_PTHREAD_RECURSIVE_MUTEX_INITIALIZER) + +#if PMIX_ENABLE_DEBUG +#define PMIX_RECURSIVE_MUTEX_STATIC_INIT \ + { \ + .super = PMIX_OBJ_STATIC_INIT(pmix_mutex_t), \ + .m_lock_pthread = PMIX_PTHREAD_RECURSIVE_MUTEX_INITIALIZER, \ + .m_lock_debug = 0, \ + .m_lock_file = NULL, \ + .m_lock_line = 0, \ + .m_lock_atomic = { .u = { .lock = 
PMIX_ATOMIC_UNLOCKED } }, \ + } +#else +#define PMIX_RECURSIVE_MUTEX_STATIC_INIT \ + { \ + .super = PMIX_OBJ_STATIC_INIT(pmix_mutex_t), \ + .m_lock_pthread = PMIX_PTHREAD_RECURSIVE_MUTEX_INITIALIZER, \ + .m_lock_atomic = { .u = { .lock = PMIX_ATOMIC_UNLOCKED } }, \ + } +#endif + +#endif + +/************************************************************************ + * + * mutex operations (non-atomic versions) + * + ************************************************************************/ + +static inline int pmix_mutex_trylock(pmix_mutex_t *m) +{ +#if PMIX_ENABLE_DEBUG + int ret = pthread_mutex_trylock(&m->m_lock_pthread); + if (ret == EDEADLK) { + errno = ret; + perror("pmix_mutex_trylock()"); + abort(); + } + return ret; +#else + return pthread_mutex_trylock(&m->m_lock_pthread); +#endif +} + +static inline void pmix_mutex_lock(pmix_mutex_t *m) +{ +#if PMIX_ENABLE_DEBUG + int ret = pthread_mutex_lock(&m->m_lock_pthread); + if (ret == EDEADLK) { + errno = ret; + perror("pmix_mutex_lock()"); + abort(); + } +#else + pthread_mutex_lock(&m->m_lock_pthread); +#endif +} + +static inline void pmix_mutex_unlock(pmix_mutex_t *m) +{ +#if PMIX_ENABLE_DEBUG + int ret = pthread_mutex_unlock(&m->m_lock_pthread); + if (ret == EPERM) { + errno = ret; + perror("pmix_mutex_unlock"); + abort(); + } +#else + pthread_mutex_unlock(&m->m_lock_pthread); +#endif +} + +/************************************************************************ + * + * mutex operations (atomic versions) + * + ************************************************************************/ + +#if PMIX_HAVE_ATOMIC_SPINLOCKS + +/************************************************************************ + * Spin Locks + ************************************************************************/ + +static inline int pmix_mutex_atomic_trylock(pmix_mutex_t *m) +{ + return pmix_atomic_trylock(&m->m_lock_atomic); +} + +static inline void pmix_mutex_atomic_lock(pmix_mutex_t *m) +{ + pmix_atomic_lock(&m->m_lock_atomic); +} + +static 
inline void pmix_mutex_atomic_unlock(pmix_mutex_t *m) +{ + pmix_atomic_unlock(&m->m_lock_atomic); +} + +#else + +/************************************************************************ + * Standard locking + ************************************************************************/ + +static inline int pmix_mutex_atomic_trylock(pmix_mutex_t *m) +{ + return pmix_mutex_trylock(m); +} + +static inline void pmix_mutex_atomic_lock(pmix_mutex_t *m) +{ + pmix_mutex_lock(m); +} + +static inline void pmix_mutex_atomic_unlock(pmix_mutex_t *m) +{ + pmix_mutex_unlock(m); +} + +#endif + +END_C_DECLS + +#endif /* PMIX_MUTEX_UNIX_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/threads/thread.c b/opal/mca/pmix/pmix2x/pmix/src/threads/thread.c new file mode 100644 index 00000000000..6513cc9e496 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/threads/thread.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015-2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "pmix_config.h" + +#include "src/threads/threads.h" +#include "src/threads/tsd.h" +#include "pmix_common.h" + +bool pmix_debug_threads = false; + +static void pmix_thread_construct(pmix_thread_t *t); + +static pthread_t pmix_main_thread; + +struct pmix_tsd_key_value { + pmix_tsd_key_t key; + pmix_tsd_destructor_t destructor; +}; + +static struct pmix_tsd_key_value *pmix_tsd_key_values = NULL; +static int pmix_tsd_key_values_count = 0; + +PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_thread_t, + pmix_object_t, + pmix_thread_construct, NULL); + + +/* + * Constructor + */ +static void pmix_thread_construct(pmix_thread_t *t) +{ + t->t_run = 0; + t->t_handle = (pthread_t) -1; +} + +int pmix_thread_start(pmix_thread_t *t) +{ + int rc; + + if (PMIX_ENABLE_DEBUG) { + if (NULL == t->t_run || t->t_handle != (pthread_t) -1) { + return PMIX_ERR_BAD_PARAM; + } + } + + rc = pthread_create(&t->t_handle, NULL, (void*(*)(void*)) t->t_run, t); + + return (rc == 0) ? PMIX_SUCCESS : PMIX_ERROR; +} + + +int pmix_thread_join(pmix_thread_t *t, void **thr_return) +{ + int rc = pthread_join(t->t_handle, thr_return); + t->t_handle = (pthread_t) -1; + return (rc == 0) ? 
PMIX_SUCCESS : PMIX_ERROR; +} + + +bool pmix_thread_self_compare(pmix_thread_t *t) +{ + return t->t_handle == pthread_self(); +} + + +pmix_thread_t *pmix_thread_get_self(void) +{ + pmix_thread_t *t = PMIX_NEW(pmix_thread_t); + t->t_handle = pthread_self(); + return t; +} + +void pmix_thread_kill(pmix_thread_t *t, int sig) +{ + pthread_kill(t->t_handle, sig); +} + +int pmix_tsd_key_create(pmix_tsd_key_t *key, + pmix_tsd_destructor_t destructor) +{ + int rc; + rc = pthread_key_create(key, destructor); + if ((0 == rc) && (pthread_self() == pmix_main_thread)) { + pmix_tsd_key_values = (struct pmix_tsd_key_value *)realloc(pmix_tsd_key_values, (pmix_tsd_key_values_count+1) * sizeof(struct pmix_tsd_key_value)); + pmix_tsd_key_values[pmix_tsd_key_values_count].key = *key; + pmix_tsd_key_values[pmix_tsd_key_values_count].destructor = destructor; + pmix_tsd_key_values_count ++; + } + return rc; +} + +int pmix_tsd_keys_destruct() +{ + int i; + void * ptr; + for (i=0; i +#include + +#include "src/class/pmix_object.h" +#if PMIX_ENABLE_DEBUG +#include "src/util/output.h" +#endif + +#include "mutex.h" +#include "condition.h" + +BEGIN_C_DECLS + +typedef void *(*pmix_thread_fn_t) (pmix_object_t *); + +#define PMIX_THREAD_CANCELLED ((void*)1); + +struct pmix_thread_t { + pmix_object_t super; + pmix_thread_fn_t t_run; + void* t_arg; + pthread_t t_handle; +}; + +typedef struct pmix_thread_t pmix_thread_t; + +#if PMIX_ENABLE_DEBUG +PMIX_EXPORT extern bool pmix_debug_threads; +#endif + + +PMIX_EXPORT PMIX_CLASS_DECLARATION(pmix_thread_t); + +#if PMIX_ENABLE_DEBUG +#define PMIX_ACQUIRE_THREAD(lck, cnd, act) \ + do { \ + PMIX_THREAD_LOCK((lck)); \ + if (pmix_debug_threads) { \ + pmix_output(0, "Waiting for thread %s:%d", \ + __FILE__, __LINE__); \ + } \ + while (*(act)) { \ + pmix_condition_wait((cnd), (lck)); \ + } \ + if (pmix_debug_threads) { \ + pmix_output(0, "Thread obtained %s:%d", \ + __FILE__, __LINE__); \ + } \ + *(act) = true; \ + } while(0); +#else +#define 
PMIX_ACQUIRE_THREAD(lck, cnd, act) \ + do { \ + PMIX_THREAD_LOCK((lck)); \ + while (*(act)) { \ + pmix_condition_wait((cnd), (lck)); \ + } \ + *(act) = true; \ + } while(0); +#endif + + +#if PMIX_ENABLE_DEBUG +#define PMIX_RELEASE_THREAD(lck, cnd, act) \ + do { \ + if (pmix_debug_threads) { \ + pmix_output(0, "Releasing thread %s:%d", \ + __FILE__, __LINE__); \ + } \ + *(act) = false; \ + pmix_condition_broadcast((cnd)); \ + PMIX_THREAD_UNLOCK((lck)); \ + } while(0); +#else +#define PMIX_RELEASE_THREAD(lck, cnd, act) \ + do { \ + *(act) = false; \ + pmix_condition_broadcast((cnd)); \ + PMIX_THREAD_UNLOCK((lck)); \ + } while(0); +#endif + + +#define PMIX_WAKEUP_THREAD(cnd, act) \ + do { \ + *(act) = false; \ + pmix_condition_broadcast((cnd)); \ + } while(0); + + +PMIX_EXPORT int pmix_thread_start(pmix_thread_t *); +PMIX_EXPORT int pmix_thread_join(pmix_thread_t *, void **thread_return); +PMIX_EXPORT bool pmix_thread_self_compare(pmix_thread_t*); +PMIX_EXPORT pmix_thread_t *pmix_thread_get_self(void); +PMIX_EXPORT void pmix_thread_kill(pmix_thread_t *, int sig); +PMIX_EXPORT void pmix_thread_set_main(void); + +END_C_DECLS + +#endif /* PMIX_THREAD_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/threads/tsd.h b/opal/mca/pmix/pmix2x/pmix/src/threads/tsd.h new file mode 100644 index 00000000000..589027217ed --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/threads/tsd.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015-2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#ifndef PMIX_THREADS_TSD_H +#define PMIX_THREADS_TSD_H + +#include "pmix_config.h" + +#include + +#include "pmix_common.h" + +BEGIN_C_DECLS + +/** + * @file + * + * Thread Specific Datastore Interface + * + * Functions for providing thread-specific datastore capabilities. + */ + + +/** + * Prototype for callback when tsd data is being destroyed + */ +typedef void (*pmix_tsd_destructor_t)(void *value); + +#if defined(DOXYGEN) + +/** + * Typedef for thread-specific data key + */ +typedef void* pmix_tsd_key_t; + + +/** + * Delete a thread-specific data key + * + * Delete a thread-specific data key previously returned by + * pmix_tsd_key_create(). The destructor associated with the key is + * not fired in any thread and memory cleanup is the responsibility of + * the caller. + * + * @note Unlike pthread_key_delete, this function should not be called + * from within a destructor. It can not be universally supported at + * this time. + * + * @param key[in] The key for accessing thread-specific data + * + * @retval PMIX_SUCCESS Success + * @retval EINVAL Invalid key + */ +PMIX_EXPORT int pmix_tsd_key_delete(pmix_tsd_key_t key); + + +/** + * Set a thread-specific data value + * + * Associates value with key in the current thread. The value for the + * key in other threads is not changed. Different threads may assign + * different values to the same key. + * + * @note This function should not be called within + * pmix_tsd_key_delete(). 
+ * + * @param key[in] Thread specific data key to modify + * @param value[in] Value to associate with key + * + * @retval PMIX_SUCCESS Success + * @retval ENOMEM Insufficient memory exists to associate the + * value with the key + * @retval EINVAL Invalid key + */ +PMIX_EXPORT int pmix_tsd_setspecific(pmix_tsd_key_t key, void *value); + + +/** + * Get a thread-specific data value + * + * Get the data associated with the given key, as set by + * pmix_tsd_setspecific(). If pmix_tsd_setspecific() hasn't been + * called in the current thread with the given key, NULL is returned + * in valuep. + * + * @param key[in] Thread specific data key to query + * @param valuep[out] Value currently associated with key + * + * @retval PMIX_SUCCESS Success + * @retval ENOMEM Insufficient memory exists to associate the + * value with the key + * @retval EINVAL Invalid key + */ +PMIX_EXPORT int pmix_tsd_getspecific(pmix_tsd_key_t key, void **valuep); + +#else + +typedef pthread_key_t pmix_tsd_key_t; + +static inline int +pmix_tsd_key_delete(pmix_tsd_key_t key) +{ + return pthread_key_delete(key); +} + +static inline int +pmix_tsd_setspecific(pmix_tsd_key_t key, void *value) +{ + return pthread_setspecific(key, value); +} + +static inline int +pmix_tsd_getspecific(pmix_tsd_key_t key, void **valuep) +{ + *valuep = pthread_getspecific(key); + return PMIX_SUCCESS; +} + +#endif + +/** + * Create thread-specific data key + * + * Create a thread-specific data key visible to all threads in the + * current process. The returned key is valid in all threads, + * although the values bound to the key by pmix_tsd_setspecific() are + * allocated on a per-thread basis and persist for the life of the + * calling thread. + * + * Upon key creation, the value NULL is associated with the new key in + * all active threads. When a new thread is created, the value NULL + * is associated with all defined keys in the new thread. + * + * The destructor parameter may be NULL.
At thread exit, if + * destructor is non-NULL AND the thread has a non-NULL value + * associated with the key, the function is called with the current + * value as its argument. + * + * @param key[out] The key for accessing thread-specific data + * @param destructor[in] Cleanup function to call when a thread exits + * + * @retval PMIX_SUCCESS Success + * @retval EAGAIN The system lacked the necessary resource to + * create another thread specific data key + * @retval ENOMEM Insufficient memory exists to create the key + */ +PMIX_EXPORT int pmix_tsd_key_create(pmix_tsd_key_t *key, + pmix_tsd_destructor_t destructor); + + +/** + * Destruct all thread-specific data keys + * + * Destruct all thread-specific data keys and invoke their destructors + * + * This should only be invoked in the main thread. + * This is made necessary since destructors are not invoked on the + * keys of the main thread, since there is no such thing as + * pthread_join(main_thread) + * + * @retval PMIX_SUCCESS Success + */ +PMIX_EXPORT int pmix_tsd_keys_destruct(void); + +END_C_DECLS + +#endif /* PMIX_THREADS_TSD_H */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/threads/wait_sync.c b/opal/mca/pmix/pmix2x/pmix/src/threads/wait_sync.c new file mode 100644 index 00000000000..c825f4cb6b5 --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/threads/wait_sync.c @@ -0,0 +1,102 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2016 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "wait_sync.h" + +static pmix_mutex_t wait_sync_lock = PMIX_MUTEX_STATIC_INIT; +static pmix_wait_sync_t* wait_sync_list = NULL; + +#define PMIX_WAIT_SYNC_PASS_OWNERSHIP(who) \ + do { \ + pthread_mutex_lock( &(who)->lock); \ + pthread_cond_signal( &(who)->condition ); \ + pthread_mutex_unlock( &(who)->lock); \ + } while(0) + +int pmix_sync_wait_mt(pmix_wait_sync_t *sync) +{ + /* Don't wait if the synchronization is already completed. We avoid the + * race condition around the release of the synchronization using the + * signaling field. + */ + if(sync->count <= 0) + return (0 == sync->status) ? PMIX_SUCCESS : PMIX_ERROR; + + /* lock so nobody can signal us during the list updating */ + pthread_mutex_lock(&sync->lock); + + /* Now that we hold the lock make sure another thread has not already + * called cond_signal. + */ + if(sync->count <= 0) { + pthread_mutex_unlock(&sync->lock); + return (0 == sync->status) ? PMIX_SUCCESS : PMIX_ERROR; + } + + /* Insert sync on the list of pending synchronization constructs */ + pmix_mutex_lock(&wait_sync_lock); + if( NULL == wait_sync_list ) { + sync->next = sync->prev = sync; + wait_sync_list = sync; + } else { + sync->prev = wait_sync_list->prev; + sync->prev->next = sync; + sync->next = wait_sync_list; + wait_sync_list->prev = sync; + } + pmix_mutex_unlock(&wait_sync_lock); + + /** + * If we are not responsible for progressing, go silent until something worth noticing happens: + * - this thread has been promoted to take care of the progress + * - our sync has been triggered. + */ + check_status: + if( sync != wait_sync_list ) { + pthread_cond_wait(&sync->condition, &sync->lock); + + /** + * At this point either the sync was completed in which case + * we should remove it from the wait list, and/or I was + * promoted as the progress manager. + */ + + if( sync->count <= 0 ) { /* Completed?
*/ + pthread_mutex_unlock(&sync->lock); + goto i_am_done; + } + /* either promoted, or spurious wakeup ! */ + goto check_status; + } + + pthread_mutex_unlock(&sync->lock); + while(sync->count > 0) { /* progress till completion */ + } + assert(sync == wait_sync_list); + + i_am_done: + /* My sync is now complete. Trim the list: remove self, wake next */ + pmix_mutex_lock(&wait_sync_lock); + sync->prev->next = sync->next; + sync->next->prev = sync->prev; + /* In case I am the progress manager, pass the duties on */ + if( sync == wait_sync_list ) { + wait_sync_list = (sync == sync->next) ? NULL : sync->next; + if( NULL != wait_sync_list ) + PMIX_WAIT_SYNC_PASS_OWNERSHIP(wait_sync_list); + } + pmix_mutex_unlock(&wait_sync_lock); + + return (0 == sync->status) ? PMIX_SUCCESS : PMIX_ERROR; +} diff --git a/opal/mca/pmix/pmix2x/pmix/src/threads/wait_sync.h b/opal/mca/pmix/pmix2x/pmix/src/threads/wait_sync.h new file mode 100644 index 00000000000..50717a96d7e --- /dev/null +++ b/opal/mca/pmix/pmix2x/pmix/src/threads/wait_sync.h @@ -0,0 +1,118 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2016 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(PMIX_THREADS_WAIT_SYNC_H) +#define PMIX_THREADS_WAIT_SYNC_H + +#include "src/atomics/sys/atomic.h" +#include "src/threads/condition.h" +#include "src/util/error.h" +#include + +BEGIN_C_DECLS + +typedef struct pmix_wait_sync_t { + int32_t count; + int32_t status; + pthread_cond_t condition; + pthread_mutex_t lock; + struct pmix_wait_sync_t *next; + struct pmix_wait_sync_t *prev; + volatile bool signaling; +} pmix_wait_sync_t; + +#define REQUEST_PENDING (void*)0L +#define REQUEST_COMPLETED (void*)1L + +#define PMIX_SYNC_WAIT(sync) sync_wait_mt (sync) + +/* The loop in release handles a race condition between the signaling + * thread and the destruction of the condition variable. The signaling + * member will be set to false after the final signaling thread has + * finished operating on the sync object. This is done to avoid + * extra atomics in the signalling function and keep it as fast + * as possible. Note that the race window is small so spinning here + * is more optimal than sleeping since this macro is called in + * the critical path. 
*/ +#define PMIX_WAIT_SYNC_RELEASE(sync) \ + while ((sync)->signaling) { \ + continue; \ + } \ + pthread_cond_destroy(&(sync)->condition); \ + pthread_mutex_destroy(&(sync)->lock); + +#define PMIX_WAIT_SYNC_RELEASE_NOWAIT(sync) \ + pthread_cond_destroy(&(sync)->condition); \ + pthread_mutex_destroy(&(sync)->lock); + + +#define PMIX_WAIT_SYNC_SIGNAL(sync) \ + pthread_mutex_lock(&(sync->lock)); \ + pthread_cond_signal(&sync->condition); \ + pthread_mutex_unlock(&(sync->lock)); \ + sync->signaling = false; + +#define PMIX_WAIT_SYNC_SIGNALLED(sync){ \ + (sync)->signaling = false; \ +} + +PMIX_EXPORT int pmix_sync_wait_mt(pmix_wait_sync_t *sync); +static inline int pmix_sync_wait_st (pmix_wait_sync_t *sync) +{ + while (sync->count > 0) { + } + + return sync->status; +} + + +#define PMIX_WAIT_SYNC_INIT(sync,c) \ + do { \ + (sync)->count = (c); \ + (sync)->next = NULL; \ + (sync)->prev = NULL; \ + (sync)->status = 0; \ + (sync)->signaling = (0 != (c)); \ + pthread_cond_init (&(sync)->condition, NULL); \ + pthread_mutex_init (&(sync)->lock, NULL); \ + } while(0) + +/** + * Update the status of the synchronization primitive. If an error is + * reported the synchronization is completed and the signal + * triggered. The status of the synchronization will be reported to + * the waiting threads. 
+ */ +static inline void pmix_wait_sync_update(pmix_wait_sync_t *sync, int updates, int status) +{ + if( PMIX_LIKELY(PMIX_SUCCESS == status) ) { + if( 0 != (PMIX_THREAD_ADD32(&sync->count, -updates)) ) { + return; + } + } else { + /* this is an error path so just use the atomic */ + sync->status = PMIX_ERROR; + pmix_atomic_wmb (); + pmix_atomic_swap_32 (&sync->count, 0); + } + PMIX_WAIT_SYNC_SIGNAL(sync); +} + +END_C_DECLS + +#endif /* defined(PMIX_THREADS_WAIT_SYNC_H) */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/util/error.c b/opal/mca/pmix/pmix2x/pmix/src/util/error.c index d75bc2cd783..29ee09f129b 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/util/error.c +++ b/opal/mca/pmix/pmix2x/pmix/src/util/error.c @@ -167,6 +167,8 @@ PMIX_EXPORT const char* PMIx_Error_string(pmix_status_t errnum) return "PMIX HEARTBEAT ALERT"; case PMIX_MONITOR_FILE_ALERT: return "PMIX FILE MONITOR ALERT"; + case PMIX_MODEL_DECLARED: + return "PMIX MODEL DECLARED"; case PMIX_SUCCESS: return "SUCCESS"; default: diff --git a/opal/mca/pmix/pmix2x/pmix/test/Makefile.am b/opal/mca/pmix/pmix2x/pmix/test/Makefile.am index 1d1a0b8f46f..ec379229652 100644 --- a/opal/mca/pmix/pmix2x/pmix/test/Makefile.am +++ b/opal/mca/pmix/pmix2x/pmix/test/Makefile.am @@ -34,7 +34,7 @@ AM_CPPFLAGS = -I$(top_builddir)/src -I$(top_builddir)/src/include -I$(top_buildd noinst_SCRIPTS = pmix_client_otheruser.sh noinst_PROGRAMS = -if WANT_PMIX_BACKWARD +if WANT_PMI_BACKWARD noinst_PROGRAMS += pmi_client pmi2_client endif @@ -48,7 +48,7 @@ pmix_test_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) pmix_test_LDADD = \ $(top_builddir)/src/libpmix.la -if WANT_PMIX_BACKWARD +if WANT_PMI_BACKWARD pmi_client_SOURCES = $(headers) \ pmi_client.c pmi_client_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) diff --git a/opal/mca/pmix/pmix2x/pmix/test/simple/Makefile.am b/opal/mca/pmix/pmix2x/pmix/test/simple/Makefile.am index 8c1dfbffaf6..32f93de75ca 100644 --- a/opal/mca/pmix/pmix2x/pmix/test/simple/Makefile.am +++ 
b/opal/mca/pmix/pmix2x/pmix/test/simple/Makefile.am @@ -11,7 +11,7 @@ # All rights reserved. # Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2012-2013 Los Alamos National Security, Inc. All rights reserved. -# Copyright (c) 2013-2016 Intel, Inc. All rights reserved +# Copyright (c) 2013-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -21,7 +21,7 @@ AM_CPPFLAGS = -I$(top_builddir)/src -I$(top_builddir)/src/include -I$(top_builddir)/include -I$(top_builddir)/include/pmix -noinst_PROGRAMS = simptest simpclient simppub simpdyn simpft simpdmodex test_pmix simptool +noinst_PROGRAMS = simptest simpclient simppub simpdyn simpft simpdmodex test_pmix simptool simpdie simptest_SOURCES = \ simptest.c @@ -70,3 +70,9 @@ simptool_SOURCES = \ simptool_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) simptool_LDADD = \ $(top_builddir)/src/libpmix.la + +simpdie_SOURCES = \ + simpdie.c +simpdie_LDFLAGS = $(PMIX_PKG_CONFIG_LDFLAGS) +simpdie_LDADD = \ + $(top_builddir)/src/libpmix.la diff --git a/opal/mca/pmix/pmix2x/pmix/test/simple/simptest.c b/opal/mca/pmix/pmix2x/pmix/test/simple/simptest.c index 528139e7626..75969651faf 100644 --- a/opal/mca/pmix/pmix2x/pmix/test/simple/simptest.c +++ b/opal/mca/pmix/pmix2x/pmix/test/simple/simptest.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. 
@@ -196,6 +196,54 @@ static void opcbfunc(pmix_status_t status, void *cbdata) x->active = false; } +/* this is an event notification function that we explicitly request + * be called when the PMIX_MODEL_DECLARED notification is issued. + * We could catch it in the general event notification function and test + * the status to see if the status matched, but it often is simpler + * to declare a use-specific notification callback point. In this case, + * we are asking to know whenever a model is declared as a means + * of testing server self-notification */ +static void model_callback(size_t evhdlr_registration_id, + pmix_status_t status, + const pmix_proc_t *source, + pmix_info_t info[], size_t ninfo, + pmix_info_t results[], size_t nresults, + pmix_event_notification_cbfunc_fn_t cbfunc, + void *cbdata) +{ + size_t n; + + /* just let us know it was received */ + fprintf(stderr, "Model event handler called with status %d(%s)\n", status, PMIx_Error_string(status)); + for (n=0; n < ninfo; n++) { + if (PMIX_STRING == info[n].value.type) { + fprintf(stderr, "\t%s:\t%s\n", info[n].key, info[n].value.data.string); + } + } + + /* we must NOT tell the event handler state machine that we + * are the last step as that will prevent it from notifying + * anyone else that might be listening for declarations */ + if (NULL != cbfunc) { + cbfunc(PMIX_SUCCESS, NULL, 0, NULL, NULL, cbdata); + } + wakeup = 0; +} + +/* event handler registration is done asynchronously */ +static void model_registration_callback(pmix_status_t status, + size_t evhandler_ref, + void *cbdata) +{ + volatile int *active = (volatile int*)cbdata; + + if (PMIX_SUCCESS != status) { + fprintf(stderr, "simptest EVENT HANDLER REGISTRATION FAILED WITH STATUS %d, ref=%lu\n", + status, (unsigned long)evhandler_ref); + } + *active = status; +} + int main(int argc, char **argv) { char **client_env=NULL; @@ -208,9 +256,12 @@ int main(int argc, char **argv) myxfer_t *x; pmix_proc_t proc; wait_tracker_t *child; - pmix_info_t 
info[2]; + pmix_info_t *info; + size_t ninfo; bool cross_version = false; bool usock = true; + volatile int active; + pmix_status_t code; /* smoke test */ if (PMIX_SUCCESS != 0) { @@ -261,20 +312,46 @@ int main(int argc, char **argv) } /* setup the server library and tell it to support tool connections */ - PMIX_INFO_CONSTRUCT(&info[0]); - (void)strncpy(info[0].key, PMIX_SERVER_TOOL_SUPPORT, PMIX_MAX_KEYLEN); - PMIX_INFO_CONSTRUCT(&info[1]); + ninfo = 2; + PMIX_INFO_CREATE(info, ninfo); + PMIX_INFO_LOAD(&info[0], PMIX_SERVER_TOOL_SUPPORT, NULL, PMIX_BOOL); PMIX_INFO_LOAD(&info[1], PMIX_USOCK_DISABLE, &usock, PMIX_BOOL); if (PMIX_SUCCESS != (rc = PMIx_server_init(&mymodule, info, 2))) { fprintf(stderr, "Init failed with error %d\n", rc); return rc; } - PMIX_INFO_DESTRUCT(&info[0]); - PMIX_INFO_DESTRUCT(&info[1]); + PMIX_INFO_FREE(info, ninfo); - /* register the errhandler */ - PMIx_Register_event_handler(NULL, 0, NULL, 0, - errhandler, errhandler_reg_callbk, NULL); + /* register the default errhandler */ + active = -1; + ninfo = 1; + PMIX_INFO_CREATE(info, ninfo); + PMIX_INFO_LOAD(&info[0], PMIX_EVENT_HDLR_NAME, "SIMPTEST-DEFAULT", PMIX_STRING); + PMIx_Register_event_handler(NULL, 0, info, ninfo, + errhandler, errhandler_reg_callbk, (void*)&active); + while (-1 == active) { + usleep(10); + } + PMIX_INFO_FREE(info, ninfo); + if (0 != active) { + exit(active); + } + + /* register a handler specifically for when models declare */ + active = -1; + ninfo = 1; + PMIX_INFO_CREATE(info, ninfo); + PMIX_INFO_LOAD(&info[0], PMIX_EVENT_HDLR_NAME, "SIMPTEST-MODEL", PMIX_STRING); + code = PMIX_MODEL_DECLARED; + PMIx_Register_event_handler(&code, 1, info, ninfo, + model_callback, model_registration_callback, (void*)&active); + while (-1 == active) { + usleep(10); + } + PMIX_INFO_FREE(info, ninfo); + if (0 != active) { + exit(active); + } /* setup the pub data, in case it is used */ PMIX_CONSTRUCT(&pubdata, pmix_list_t); @@ -368,7 +445,23 @@ int main(int argc, char **argv) 
nanosleep(&ts, NULL); } - /* deregister the errhandler */ + /* try notifying ourselves */ + ninfo = 3; + PMIX_INFO_CREATE(info, ninfo); + PMIX_INFO_LOAD(&info[0], PMIX_PROGRAMMING_MODEL, "PMIX", PMIX_STRING); + PMIX_INFO_LOAD(&info[1], PMIX_MODEL_LIBRARY_NAME, "test", PMIX_STRING); + /* mark that it is not to go to any default handlers */ + PMIX_INFO_LOAD(&info[2], PMIX_EVENT_NON_DEFAULT, NULL, PMIX_BOOL); + wakeup = -1; + PMIx_Notify_event(PMIX_MODEL_DECLARED, + &pmix_globals.myid, PMIX_RANGE_PROC_LOCAL, + info, ninfo, NULL, NULL); + while (-1 == wakeup) { + usleep(10); + } + PMIX_INFO_FREE(info, ninfo); + + /* deregister the event handlers */ PMIx_Deregister_event_handler(0, NULL, NULL); /* release any pub data */ @@ -443,8 +536,11 @@ static void errhandler_reg_callbk (pmix_status_t status, size_t errhandler_ref, void *cbdata) { + volatile int *active = (volatile int*)cbdata; + pmix_output(0, "SERVER: ERRHANDLER REGISTRATION CALLBACK CALLED WITH STATUS %d, ref=%lu", status, (unsigned long)errhandler_ref); + *active = status; } static pmix_status_t connected(const pmix_proc_t *proc, void *server_object, From 3227e170d2fcc43eddaada924ea52615c74ec1a5 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 28 May 2017 20:51:09 -0700 Subject: [PATCH 21/29] Protect against the condition where the port string is actually NULL Signed-off-by: Ralph Castain (cherry picked from commit ed4078e2ddc63740b4b7a6e36cfb91f7160e3202) --- ompi/mca/rte/orte/rte_orte_module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/mca/rte/orte/rte_orte_module.c b/ompi/mca/rte/orte/rte_orte_module.c index 15248e82cb1..a5e2bcc4e7e 100644 --- a/ompi/mca/rte/orte/rte_orte_module.c +++ b/ompi/mca/rte/orte/rte_orte_module.c @@ -200,7 +200,7 @@ bool ompi_rte_connect_accept_support(const char *port) /* were we launched by mpirun, or are we calling * without a defined port? 
*/ if (NULL == orte_process_info.my_hnp_uri || - 0 == strlen(port)) { + NULL == port || 0 == strlen(port)) { return true; } From 73c25a81db48a84ee5bd4211ec3520dd63405ae7 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 29 May 2017 11:25:20 -0700 Subject: [PATCH 22/29] Add some debug code for detecting leaking file descriptors. At the end of each job (and if MCA param is set), have each daemon compute the number of open fds and their characteristics and print a summary Signed-off-by: Ralph Castain (cherry picked from commit f3ab326b4ae4a7e4b7cb3f000dfee88437018788) --- orte/mca/state/base/state_base_fns.c | 107 +++++++++++++++++++++++++ orte/mca/state/base/state_base_frame.c | 19 ++++- orte/mca/state/base/state_private.h | 4 +- orte/mca/state/orted/state_orted.c | 5 ++ 4 files changed, 132 insertions(+), 3 deletions(-) diff --git a/orte/mca/state/base/state_base_fns.c b/orte/mca/state/base/state_base_fns.c index 38c27ba08a2..cfc258d7d15 100644 --- a/orte/mca/state/base/state_base_fns.c +++ b/orte/mca/state/base/state_base_fns.c @@ -13,6 +13,13 @@ #include "orte_config.h" #include "orte/constants.h" +#if HAVE_UNISTD_H +#include +#endif +#if HAVE_FCNTL_H +#include +#endif + #include "opal/class/opal_list.h" #include "opal/mca/event/event.h" #include "opal/mca/pmix/pmix.h" @@ -714,6 +721,10 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata) /* track job status */ jdata->num_terminated++; if (jdata->num_terminated == jdata->num_procs) { + /* if requested, check fd status for leaks */ + if (orte_state_base_run_fdcheck) { + orte_state_base_check_fds(jdata); + } ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); /* if they requested notification upon completion, provide it */ if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) { @@ -1016,3 +1027,99 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata) OBJ_RELEASE(caddy); } + + +void orte_state_base_check_fds(orte_job_t *jdata) +{ 
+ int nfds, i, fdflags, flflags; + char path[1024], info[256], **list=NULL, *status, *result, *r2; + ssize_t rc; + struct flock fl; + int cnt = 0; + + /* get the number of available file descriptors + * for this daemon */ + nfds = getdtablesize(); + result = NULL; + /* loop over them and get their info */ + for (i=0; i < nfds; i++) { + fdflags = fcntl(i, F_GETFD); + if (-1 == fdflags) { + /* no open fd in that slot */ + continue; + } + flflags = fcntl(i, F_GETFL); + if (-1 == flflags) { + /* no open fd in that slot */ + continue; + } + snprintf(path, 1024, "/proc/self/fd/%d", i); + memset(info, 0, 256); + /* read the info about this fd */ + rc = readlink(path, info, 256); + if (-1 == rc) { + /* this fd is unavailable */ + continue; + } + /* get any file locking status */ + fl.l_type = F_WRLCK; + fl.l_whence = 0; + fl.l_start = 0; + fl.l_len = 0; + fcntl(i, F_GETLK, &fl); + /* construct the list of capabilities */ + if (fdflags & FD_CLOEXEC) { + opal_argv_append_nosize(&list, "cloexec"); + } + if (flflags & O_APPEND) { + opal_argv_append_nosize(&list, "append"); + } + if (flflags & O_NONBLOCK) { + opal_argv_append_nosize(&list, "nonblock"); + } + if (flflags & O_RDONLY) { + opal_argv_append_nosize(&list, "rdonly"); + } + if (flflags & O_RDWR) { + opal_argv_append_nosize(&list, "rdwr"); + } + if (flflags & O_WRONLY) { + opal_argv_append_nosize(&list, "wronly"); + } + if (flflags & O_DSYNC) { + opal_argv_append_nosize(&list, "dsync"); + } + if (flflags & O_RSYNC) { + opal_argv_append_nosize(&list, "rsync"); + } + if (flflags & O_SYNC) { + opal_argv_append_nosize(&list, "sync"); + } + if (F_UNLCK != fl.l_type) { + if (F_WRLCK == fl.l_type) { + opal_argv_append_nosize(&list, "wrlock"); + } else { + opal_argv_append_nosize(&list, "rdlock"); + } + } + if (NULL != list) { + status = opal_argv_join(list, ' '); + opal_argv_free(list); + list = NULL; + if (NULL == result) { + asprintf(&result, " %d\t(%s)\t%s\n", i, info, status); + } else { + asprintf(&r2, "%s 
%d\t(%s)\t%s\n", result, i, info, status); + free(result); + result = r2; + } + free(status); + } + ++cnt; + } + asprintf(&r2, "%s: %d open file descriptors after job %d completed\n%s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cnt, ORTE_LOCAL_JOBID(jdata->jobid), result); + opal_output(0, "%s", r2); + free(result); + free(r2); +} diff --git a/orte/mca/state/base/state_base_frame.c b/orte/mca/state/base/state_base_frame.c index 3838d901dd9..74c009d46fc 100644 --- a/orte/mca/state/base/state_base_frame.c +++ b/orte/mca/state/base/state_base_frame.c @@ -4,6 +4,7 @@ * All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -41,6 +42,20 @@ * Globals */ orte_state_base_module_t orte_state = {0}; +bool orte_state_base_run_fdcheck = false; + +static int orte_state_base_register(mca_base_register_flag_t flags) +{ + orte_state_base_run_fdcheck = false; + mca_base_var_register("orte", "state", "base", "check_fds", + "Daemons should check fds for leaks after each job completes", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &orte_state_base_run_fdcheck); + + return ORTE_SUCCESS; +} static int orte_state_base_close(void) { @@ -62,7 +77,8 @@ static int orte_state_base_open(mca_base_open_flag_t flags) return mca_base_framework_components_open(&orte_state_base_framework, flags); } -MCA_BASE_FRAMEWORK_DECLARE(orte, state, "ORTE State Machine", NULL, +MCA_BASE_FRAMEWORK_DECLARE(orte, state, "ORTE State Machine", + orte_state_base_register, orte_state_base_open, orte_state_base_close, mca_state_base_static_components, 0); @@ -95,4 +111,3 @@ OBJ_CLASS_INSTANCE(orte_state_caddy_t, opal_object_t, orte_state_caddy_construct, orte_state_caddy_destruct); - diff --git a/orte/mca/state/base/state_private.h b/orte/mca/state/base/state_private.h index 
0c9db094ad6..3ba3bcc1dde 100644 --- a/orte/mca/state/base/state_private.h +++ b/orte/mca/state/base/state_private.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -31,6 +32,7 @@ BEGIN_C_DECLS +extern bool orte_state_base_run_fdcheck; /* * Base functions */ @@ -75,7 +77,7 @@ ORTE_DECLSPEC void orte_state_base_cleanup_job(int fd, short argc, void *cbdata) ORTE_DECLSPEC void orte_state_base_report_progress(int fd, short argc, void *cbdata); ORTE_DECLSPEC void orte_state_base_track_procs(int fd, short argc, void *cbdata); ORTE_DECLSPEC void orte_state_base_check_all_complete(int fd, short args, void *cbdata); - +ORTE_DECLSPEC void orte_state_base_check_fds(orte_job_t *jdata); END_C_DECLS #endif diff --git a/orte/mca/state/orted/state_orted.c b/orte/mca/state/orted/state_orted.c index 708d69fca2f..55ad8082e17 100644 --- a/orte/mca/state/orted/state_orted.c +++ b/orte/mca/state/orted/state_orted.c @@ -484,6 +484,11 @@ static void track_procs(int fd, short argc, void *cbdata) jdata->map = NULL; } + /* if requested, check fd status for leaks */ + if (orte_state_base_run_fdcheck) { + orte_state_base_check_fds(jdata); + } + /* cleanup the job info */ opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL); OBJ_RELEASE(jdata); From 7a4418a79e28a47275a88995bfa451cc9e157c34 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 30 May 2017 09:43:01 -0700 Subject: [PATCH 23/29] Ensure that data from a job that was stored in ompi-server is purged once that job completes. Cleanup a few typos. 
Silence a Coverity warning Signed-off-by: Ralph Castain (cherry picked from commit 9a8811a2460c1ab4551fa84d5fc1020b87716bab) --- orte/mca/state/base/state_base_fns.c | 78 +++++++++++++++++++++++--- orte/mca/state/base/state_private.h | 1 + orte/mca/state/orted/state_orted.c | 11 ++++ orte/orted/pmix/pmix_server.c | 1 + orte/orted/pmix/pmix_server_internal.h | 4 +- orte/orted/pmix/pmix_server_pub.c | 14 ++--- orte/runtime/orte_data_server.c | 40 +++++++++++++ orte/runtime/orte_data_server.h | 10 ++-- 8 files changed, 137 insertions(+), 22 deletions(-) diff --git a/orte/mca/state/base/state_base_fns.c b/orte/mca/state/base/state_base_fns.c index cfc258d7d15..298e77b0379 100644 --- a/orte/mca/state/base/state_base_fns.c +++ b/orte/mca/state/base/state_base_fns.c @@ -24,6 +24,8 @@ #include "opal/mca/event/event.h" #include "opal/mca/pmix/pmix.h" +#include "orte/orted/pmix/pmix_server_internal.h" +#include "orte/runtime/orte_data_server.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_wait.h" #include "orte/mca/errmgr/errmgr.h" @@ -466,6 +468,50 @@ void orte_state_base_report_progress(int fd, short argc, void *cbdata) OBJ_RELEASE(caddy); } +void orte_state_base_notify_data_server(orte_process_name_t *target) +{ + opal_buffer_t *buf; + int rc, room = -1; + uint8_t cmd = ORTE_PMIX_PURGE_PROC_CMD; + + /* if nobody local to us published anything, then we can ignore this */ + if (ORTE_JOBID_INVALID == orte_pmix_server_globals.server.jobid) { + return; + } + + buf = OBJ_NEW(opal_buffer_t); + + /* pack the room number */ + if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &room, 1, OPAL_INT))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + return; + } + + /* load the command */ + if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &cmd, 1, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + return; + } + + /* provide the target */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, target, 1, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buf); + return; + } + + 
/* send the request to the server */ + rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, + &orte_pmix_server_globals.server, buf, + ORTE_RML_TAG_DATA_SERVER, + orte_rml_send_callback, NULL); + if (ORTE_SUCCESS != rc) { + OBJ_RELEASE(buf); + } +} + static void _send_notification(int status, orte_proc_state_t state, orte_process_name_t *proc, @@ -725,6 +771,13 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata) if (orte_state_base_run_fdcheck) { orte_state_base_check_fds(jdata); } + /* if ompi-server is around, then notify it to purge + * any session-related info */ + if (NULL != orte_data_server_uri) { + target.jobid = jdata->jobid; + target.vpid = ORTE_VPID_WILDCARD; + orte_state_base_notify_data_server(&target); + } ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); /* if they requested notification upon completion, provide it */ if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) { @@ -1035,6 +1088,7 @@ void orte_state_base_check_fds(orte_job_t *jdata) char path[1024], info[256], **list=NULL, *status, *result, *r2; ssize_t rc; struct flock fl; + bool flk; int cnt = 0; /* get the number of available file descriptors @@ -1066,7 +1120,11 @@ void orte_state_base_check_fds(orte_job_t *jdata) fl.l_whence = 0; fl.l_start = 0; fl.l_len = 0; - fcntl(i, F_GETLK, &fl); + if (-1 == fcntl(i, F_GETLK, &fl)) { + flk = false; + } else { + flk = true; + } /* construct the list of capabilities */ if (fdflags & FD_CLOEXEC) { opal_argv_append_nosize(&list, "cloexec"); @@ -1077,14 +1135,18 @@ void orte_state_base_check_fds(orte_job_t *jdata) if (flflags & O_NONBLOCK) { opal_argv_append_nosize(&list, "nonblock"); } - if (flflags & O_RDONLY) { + /* from the man page: + * Unlike the other values that can be specified in flags, + * the access mode values O_RDONLY, O_WRONLY, and O_RDWR, + * do not specify individual bits. Rather, they define + * the low order two bits of flags, and defined respectively + * as 0, 1, and 2. 
*/ + if (O_RDONLY == (flflags & 3)) { opal_argv_append_nosize(&list, "rdonly"); - } - if (flflags & O_RDWR) { - opal_argv_append_nosize(&list, "rdwr"); - } - if (flflags & O_WRONLY) { + } else if (O_WRONLY == (flflags & 3)) { opal_argv_append_nosize(&list, "wronly"); + } else { + opal_argv_append_nosize(&list, "rdwr"); } if (flflags & O_DSYNC) { opal_argv_append_nosize(&list, "dsync"); @@ -1095,7 +1157,7 @@ void orte_state_base_check_fds(orte_job_t *jdata) if (flflags & O_SYNC) { opal_argv_append_nosize(&list, "sync"); } - if (F_UNLCK != fl.l_type) { + if (flk && F_UNLCK != fl.l_type) { if (F_WRLCK == fl.l_type) { opal_argv_append_nosize(&list, "wrlock"); } else { diff --git a/orte/mca/state/base/state_private.h b/orte/mca/state/base/state_private.h index 3ba3bcc1dde..1e63eeac63f 100644 --- a/orte/mca/state/base/state_private.h +++ b/orte/mca/state/base/state_private.h @@ -78,6 +78,7 @@ ORTE_DECLSPEC void orte_state_base_report_progress(int fd, short argc, void *cbd ORTE_DECLSPEC void orte_state_base_track_procs(int fd, short argc, void *cbdata); ORTE_DECLSPEC void orte_state_base_check_all_complete(int fd, short args, void *cbdata); ORTE_DECLSPEC void orte_state_base_check_fds(orte_job_t *jdata); +ORTE_DECLSPEC void orte_state_base_notify_data_server(orte_process_name_t *target); END_C_DECLS #endif diff --git a/orte/mca/state/orted/state_orted.c b/orte/mca/state/orted/state_orted.c index 55ad8082e17..1c9243b3a42 100644 --- a/orte/mca/state/orted/state_orted.c +++ b/orte/mca/state/orted/state_orted.c @@ -27,6 +27,8 @@ #include "orte/mca/rml/rml.h" #include "orte/mca/routed/routed.h" #include "orte/util/session_dir.h" +#include "orte/orted/pmix/pmix_server_internal.h" +#include "orte/runtime/orte_data_server.h" #include "orte/runtime/orte_quit.h" #include "orte/mca/state/state.h" @@ -260,6 +262,7 @@ static void track_procs(int fd, short argc, void *cbdata) orte_std_cntr_t index; orte_job_map_t *map; orte_node_t *node; + orte_process_name_t target; 
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted:track_procs called for proc %s state %s", @@ -489,6 +492,14 @@ static void track_procs(int fd, short argc, void *cbdata) orte_state_base_check_fds(jdata); } + /* if ompi-server is around, then notify it to purge + * any session-related info */ + if (NULL != orte_data_server_uri) { + target.jobid = jdata->jobid; + target.vpid = ORTE_VPID_WILDCARD; + orte_state_base_notify_data_server(&target); + } + /* cleanup the job info */ opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL); OBJ_RELEASE(jdata); diff --git a/orte/orted/pmix/pmix_server.c b/orte/orted/pmix/pmix_server.c index d443ee4c688..32e7410609e 100644 --- a/orte/orted/pmix/pmix_server.c +++ b/orte/orted/pmix/pmix_server.c @@ -220,6 +220,7 @@ int pmix_server_init(void) return rc; } OBJ_CONSTRUCT(&orte_pmix_server_globals.notifications, opal_list_t); + orte_pmix_server_globals.server = *ORTE_NAME_INVALID; /* setup recv for direct modex requests */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DIRECT_MODEX, diff --git a/orte/orted/pmix/pmix_server_internal.h b/orte/orted/pmix/pmix_server_internal.h index 52460271884..7046cc0a17f 100644 --- a/orte/orted/pmix/pmix_server_internal.h +++ b/orte/orted/pmix/pmix_server_internal.h @@ -45,8 +45,9 @@ #include "opal/util/proc.h" #include "orte/mca/grpcomm/base/base.h" +#include "orte/runtime/orte_globals.h" - BEGIN_C_DECLS +BEGIN_C_DECLS #define ORTED_PMIX_MIN_DMX_TIMEOUT 10 #define ORTE_ADJUST_TIMEOUT(a) \ @@ -252,7 +253,6 @@ typedef struct { opal_hotel_t reqs; int num_rooms; int timeout; - char *server_uri; bool wait_for_server; orte_process_name_t server; opal_list_t notifications; diff --git a/orte/orted/pmix/pmix_server_pub.c b/orte/orted/pmix/pmix_server_pub.c index f970b3b5909..4f44799979a 100644 --- a/orte/orted/pmix/pmix_server_pub.c +++ b/orte/orted/pmix/pmix_server_pub.c @@ -69,7 +69,7 @@ static int init_server(void) if (NULL == filename) { /* filename 
is not correctly formatted */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, - orte_basename, orte_pmix_server_globals.server_uri); + orte_basename, orte_data_server_uri); return ORTE_ERR_BAD_PARAM; } ++filename; /* space past the : */ @@ -77,7 +77,7 @@ static int init_server(void) if (0 >= strlen(filename)) { /* they forgot to give us the name! */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true, - orte_basename, orte_pmix_server_globals.server_uri); + orte_basename, orte_data_server_uri); return ORTE_ERR_BAD_PARAM; } @@ -85,14 +85,14 @@ static int init_server(void) fp = fopen(filename, "r"); if (NULL == fp) { /* can't find or read file! */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true, - orte_basename, orte_pmix_server_globals.server_uri); + orte_basename, orte_data_server_uri); return ORTE_ERR_BAD_PARAM; } if (NULL == fgets(input, 1024, fp)) { /* something malformed about file */ fclose(fp); orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true, - orte_basename, orte_pmix_server_globals.server_uri, + orte_basename, orte_data_server_uri, orte_basename); return ORTE_ERR_BAD_PARAM; } @@ -100,7 +100,7 @@ static int init_server(void) input[strlen(input)-1] = '\0'; /* remove newline */ server = strdup(input); } else { - server = strdup(orte_pmix_server_globals.server_uri); + server = strdup(orte_data_server_uri); } /* setup our route to the server */ OBJ_CONSTRUCT(&buf, opal_buffer_t); @@ -154,8 +154,8 @@ static void execute(int sd, short args, void *cbdata) /* we need to initialize our connection to the server */ if (ORTE_SUCCESS != (rc = init_server())) { orte_show_help("help-orted.txt", "noserver", true, - (NULL == orte_pmix_server_globals.server_uri) ? - "NULL" : orte_pmix_server_globals.server_uri); + (NULL == orte_data_server_uri) ? 
+ "NULL" : orte_data_server_uri); goto callback; } } diff --git a/orte/runtime/orte_data_server.c b/orte/runtime/orte_data_server.c index 605b0acd077..e20eb26b814 100644 --- a/orte/runtime/orte_data_server.c +++ b/orte/runtime/orte_data_server.c @@ -653,6 +653,46 @@ void orte_data_server(int status, orte_process_name_t* sender, goto SEND_ANSWER; break; + case ORTE_PMIX_PURGE_PROC_CMD: + /* unpack the proc whose data is to be purged - session + * data is purged by providing a requestor whose rank + * is wildcard */ + count = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &requestor, &count, OPAL_NAME))) { + ORTE_ERROR_LOG(rc); + goto SEND_ERROR; + } + + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, + "%s data server: purge data from %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&requestor))); + + /* cycle across the stored data, looking for a match */ + for (k=0; k < orte_data_server_store.size; k++) { + data = (orte_data_object_t*)opal_pointer_array_get_item(&orte_data_server_store, k); + if (NULL == data) { + continue; + } + /* check if data posted by the same process */ + if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &data->owner, &requestor)) { + continue; + } + /* check persistence - if it is intended to persist beyond the + * proc itself, then we only delete it if rank=wildcard*/ + if ((data->persistence == OPAL_PMIX_PERSIST_APP || + data->persistence == OPAL_PMIX_PERSIST_SESSION) && + ORTE_VPID_WILDCARD != requestor.vpid) { + continue; + } + /* remove the object */ + opal_pointer_array_set_item(&orte_data_server_store, k, NULL); + OBJ_RELEASE(data); + } + /* no response is required */ + OBJ_RELEASE(answer); + return; + default: ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); rc = ORTE_ERR_BAD_PARAM; diff --git a/orte/runtime/orte_data_server.h b/orte/runtime/orte_data_server.h index 8981732445a..81eac536818 100644 --- a/orte/runtime/orte_data_server.h +++ b/orte/runtime/orte_data_server.h @@ -11,7 +11,7 @@ * All rights 
reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -35,10 +35,10 @@ BEGIN_C_DECLS -#define ORTE_PMIX_PUBLISH_CMD 0x01 -#define ORTE_PMIX_LOOKUP_CMD 0x02 -#define ORTE_PMIX_UNPUBLISH_CMD 0x03 - +#define ORTE_PMIX_PUBLISH_CMD 0x01 +#define ORTE_PMIX_LOOKUP_CMD 0x02 +#define ORTE_PMIX_UNPUBLISH_CMD 0x03 +#define ORTE_PMIX_PURGE_PROC_CMD 0x04 /* provide hooks to startup and finalize the data server */ ORTE_DECLSPEC int orte_data_server_init(void); From b5f470e9c3825612b5432e9109716a536858b004 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 30 May 2017 11:42:42 -0700 Subject: [PATCH 24/29] Fix the DVM Signed-off-by: Ralph Castain (cherry picked from commit ad108ba44d9d79c8f6c189dafd16daa336dc9a1b) --- orte/mca/odls/base/odls_base_default_fns.c | 3 +-- orte/mca/rmaps/round_robin/rmaps_rr_mappers.c | 2 +- orte/mca/state/dvm/state_dvm.c | 6 +++++- orte/orted/orted_submit.c | 1 - 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 8ce47c18e3b..6e7b7be5051 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -485,8 +485,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, } } - if (!ORTE_PROC_IS_HNP && - !orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { /* compute and save bindings of local children */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c index 
c0b08e2a033..505e05b35e8 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c @@ -359,7 +359,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata, return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; - orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); } /* not all nodes are equal, so only set oversubscribed for * this node if it is in that state diff --git a/orte/mca/state/dvm/state_dvm.c b/orte/mca/state/dvm/state_dvm.c index d095813594f..df74280669c 100644 --- a/orte/mca/state/dvm/state_dvm.c +++ b/orte/mca/state/dvm/state_dvm.c @@ -80,6 +80,8 @@ static orte_job_state_t launch_states[] = { ORTE_JOB_STATE_DAEMONS_LAUNCHED, ORTE_JOB_STATE_DAEMONS_REPORTED, ORTE_JOB_STATE_VM_READY, + ORTE_JOB_STATE_MAP, + ORTE_JOB_STATE_MAP_COMPLETE, ORTE_JOB_STATE_SYSTEM_PREP, ORTE_JOB_STATE_LAUNCH_APPS, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE, @@ -98,6 +100,8 @@ static orte_state_cbfunc_t launch_callbacks[] = { orte_plm_base_daemons_launched, orte_plm_base_daemons_reported, vm_ready, + orte_rmaps_base_map_job, + orte_plm_base_mapping_complete, orte_plm_base_complete_setup, orte_plm_base_launch_apps, orte_state_base_local_launch_complete, @@ -211,7 +215,7 @@ static void files_ready(int status, void *cbdata) ORTE_FORCED_TERMINATE(status); return; } else { - ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SYSTEM_PREP); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); } } diff --git a/orte/orted/orted_submit.c b/orte/orted/orted_submit.c index 80090731766..41a20d88062 100644 --- a/orte/orted/orted_submit.c +++ b/orte/orted/orted_submit.c @@ -370,7 +370,6 @@ int orte_submit_init(int argc, char *argv[], } else { orte_process_info.proc_type = ORTE_PROC_TOOL; } - if (ORTE_PROC_IS_TOOL) { if (0 == strncasecmp(orte_cmd_options.hnp, "file", strlen("file"))) { char input[1024], *filename; From 
2d5e03065b900cca5450fefdff6c1e8f3e4e0157 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 30 May 2017 14:07:22 -0700 Subject: [PATCH 25/29] Fix cwd and preload-binary options Signed-off-by: Ralph Castain (cherry picked from commit 321abfc8c6cf310ed852295d9189dbe43c9d2203) --- orte/mca/odls/base/odls_base_default_fns.c | 80 ++++++++++++---------- orte/mca/schizo/ompi/schizo_ompi.c | 5 ++ orte/orted/orted_submit.c | 13 ++-- 3 files changed, 51 insertions(+), 47 deletions(-) diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 6e7b7be5051..bd7ed13bd4c 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -534,11 +534,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, static int setup_path(orte_app_context_t *app, char **wdir) { - int rc; + int rc=ORTE_SUCCESS; char dir[MAXPATHLEN]; - char **argvptr; - char *pathenv = NULL, *mpiexec_pathenv = NULL; - char *full_search; if (!orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) { /* Try to change to the app's cwd and check that the app @@ -572,40 +569,6 @@ static int setup_path(orte_app_context_t *app, char **wdir) *wdir = NULL; } - /* Search for the OMPI_exec_path and PATH settings in the environment. */ - for (argvptr = app->env; *argvptr != NULL; argvptr++) { - if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) { - mpiexec_pathenv = *argvptr + 15; - } - if (0 == strncmp("PATH=", *argvptr, 5)) { - pathenv = *argvptr + 5; - } - } - - /* If OMPI_exec_path is set (meaning --path was used), then create a - temporary environment to be used in the search for the executable. - The PATH setting in this temporary environment is a combination of - the OMPI_exec_path and PATH values. If OMPI_exec_path is not set, - then just use existing environment with PATH in it. 
*/ - if (NULL != mpiexec_pathenv) { - argvptr = NULL; - if (pathenv != NULL) { - asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv); - } else { - asprintf(&full_search, "%s", mpiexec_pathenv); - } - opal_setenv("PATH", full_search, true, &argvptr); - free(full_search); - } else { - argvptr = app->env; - } - - rc = orte_util_check_context_app(app, argvptr); - /* do not ERROR_LOG - it will be reported elsewhere */ - if (NULL != mpiexec_pathenv) { - opal_argv_free(argvptr); - } - CLEANUP: return rc; } @@ -662,6 +625,9 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) int rc, i; bool found; orte_proc_state_t state; + char **argvptr; + char *pathenv = NULL, *mpiexec_pathenv = NULL; + char *full_search; /* thread-protect common values */ cd->env = opal_argv_copy(app->env); @@ -762,6 +728,44 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) goto errorout; } + /* Search for the OMPI_exec_path and PATH settings in the environment. */ + for (argvptr = app->env; *argvptr != NULL; argvptr++) { + if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) { + mpiexec_pathenv = *argvptr + 15; + } + if (0 == strncmp("PATH=", *argvptr, 5)) { + pathenv = *argvptr + 5; + } + } + + /* If OMPI_exec_path is set (meaning --path was used), then create a + temporary environment to be used in the search for the executable. + The PATH setting in this temporary environment is a combination of + the OMPI_exec_path and PATH values. If OMPI_exec_path is not set, + then just use existing environment with PATH in it. 
*/ + if (NULL != mpiexec_pathenv) { + argvptr = NULL; + if (pathenv != NULL) { + asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv); + } else { + asprintf(&full_search, "%s", mpiexec_pathenv); + } + opal_setenv("PATH", full_search, true, &argvptr); + free(full_search); + } else { + argvptr = app->env; + } + + rc = orte_util_check_context_app(app, argvptr); + /* do not ERROR_LOG - it will be reported elsewhere */ + if (NULL != mpiexec_pathenv) { + opal_argv_free(argvptr); + } + if (ORTE_SUCCESS != rc) { + state = ORTE_PROC_STATE_FAILED_TO_LAUNCH; + goto errorout; + } + /* if we are indexing the argv by rank, do so now */ if (cd->index_argv && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { char *param; diff --git a/orte/mca/schizo/ompi/schizo_ompi.c b/orte/mca/schizo/ompi/schizo_ompi.c index b0e77f37cb1..e01198a7c97 100644 --- a/orte/mca/schizo/ompi/schizo_ompi.c +++ b/orte/mca/schizo/ompi/schizo_ompi.c @@ -1207,6 +1207,11 @@ static int setup_child(orte_job_t *jdata, opal_setenv("PWD", param, true, env); /* update the initial wdir value too */ opal_setenv("OMPI_MCA_initial_wdir", param, true, env); + } else if (NULL != app->cwd) { + /* change to it */ + if (0 != chdir(app->cwd)) { + return ORTE_ERROR; + } } return ORTE_SUCCESS; } diff --git a/orte/orted/orted_submit.c b/orte/orted/orted_submit.c index 41a20d88062..07511cbaf74 100644 --- a/orte/orted/orted_submit.c +++ b/orte/orted/orted_submit.c @@ -1628,22 +1628,17 @@ static int create_app(int argc, char* argv[], app->num_procs = (orte_std_cntr_t)orte_cmd_options.num_procs; total_num_apps++; - /* Capture any preload flags */ - if (orte_cmd_options.preload_binaries) { - orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - } - /* if we were told to cwd to the session dir and the app was given in - * relative syntax, then we need to preload the binary to + /* see if we need to preload the binary to * find the app - don't do this for java apps, however, as we 
* can't easily find the class on the cmd line. Java apps have to * preload their binary via the preload_files option */ - if (!opal_path_is_absolute(app->argv[0]) && - NULL == strstr(app->argv[0], "java")) { + if (NULL == strstr(app->argv[0], "java")) { if (orte_cmd_options.preload_binaries) { orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - } else if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) { orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + /* no harm in setting this attribute twice as the function will simply ignore it */ + orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); } } if (NULL != orte_cmd_options.preload_files) { From fe5d92ae2198c96b7627ee066636cf1b8bf6d4a0 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 30 May 2017 15:58:55 -0700 Subject: [PATCH 26/29] Reorg ordering so that bare executable names also are found Signed-off-by: Ralph Castain (cherry picked from commit 5d990b557cbe7d45eec7fe3e6c853f0e75416078) --- orte/mca/odls/base/odls_base_default_fns.c | 95 +++++++++++----------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index bd7ed13bd4c..a243157a4a3 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -659,6 +659,54 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) child->rml_uri = NULL; } + /* setup the rest of the environment with the proc-specific items - these + * will be overwritten for each child + */ + if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &cd->env))) { + ORTE_ERROR_LOG(rc); + state = ORTE_PROC_STATE_FAILED_TO_LAUNCH; + goto errorout; + } + + /* Search for the OMPI_exec_path and PATH settings in the environment. 
*/ + for (argvptr = app->env; *argvptr != NULL; argvptr++) { + if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) { + mpiexec_pathenv = *argvptr + 15; + } + if (0 == strncmp("PATH=", *argvptr, 5)) { + pathenv = *argvptr + 5; + } + } + + /* If OMPI_exec_path is set (meaning --path was used), then create a + temporary environment to be used in the search for the executable. + The PATH setting in this temporary environment is a combination of + the OMPI_exec_path and PATH values. If OMPI_exec_path is not set, + then just use existing environment with PATH in it. */ + if (NULL != mpiexec_pathenv) { + argvptr = NULL; + if (pathenv != NULL) { + asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv); + } else { + asprintf(&full_search, "%s", mpiexec_pathenv); + } + opal_setenv("PATH", full_search, true, &argvptr); + free(full_search); + } else { + argvptr = app->env; + } + + rc = orte_util_check_context_app(app, argvptr); + /* do not ERROR_LOG - it will be reported elsewhere */ + if (NULL != mpiexec_pathenv) { + opal_argv_free(argvptr); + } + if (ORTE_SUCCESS != rc) { + opal_output(0, "%s:%d", __FILE__, __LINE__); + state = ORTE_PROC_STATE_FAILED_TO_LAUNCH; + goto errorout; + } + /* did the user request we display output in xterms? */ if (NULL != orte_xterm && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { opal_list_item_t *nmitem; @@ -719,53 +767,6 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) cd->argv = opal_argv_copy(app->argv); } - /* setup the rest of the environment with the proc-specific items - these - * will be overwritten for each child - */ - if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &cd->env))) { - ORTE_ERROR_LOG(rc); - state = ORTE_PROC_STATE_FAILED_TO_LAUNCH; - goto errorout; - } - - /* Search for the OMPI_exec_path and PATH settings in the environment. 
*/ - for (argvptr = app->env; *argvptr != NULL; argvptr++) { - if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) { - mpiexec_pathenv = *argvptr + 15; - } - if (0 == strncmp("PATH=", *argvptr, 5)) { - pathenv = *argvptr + 5; - } - } - - /* If OMPI_exec_path is set (meaning --path was used), then create a - temporary environment to be used in the search for the executable. - The PATH setting in this temporary environment is a combination of - the OMPI_exec_path and PATH values. If OMPI_exec_path is not set, - then just use existing environment with PATH in it. */ - if (NULL != mpiexec_pathenv) { - argvptr = NULL; - if (pathenv != NULL) { - asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv); - } else { - asprintf(&full_search, "%s", mpiexec_pathenv); - } - opal_setenv("PATH", full_search, true, &argvptr); - free(full_search); - } else { - argvptr = app->env; - } - - rc = orte_util_check_context_app(app, argvptr); - /* do not ERROR_LOG - it will be reported elsewhere */ - if (NULL != mpiexec_pathenv) { - opal_argv_free(argvptr); - } - if (ORTE_SUCCESS != rc) { - state = ORTE_PROC_STATE_FAILED_TO_LAUNCH; - goto errorout; - } - /* if we are indexing the argv by rank, do so now */ if (cd->index_argv && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { char *param; From f94b57b6e30d009eea53a64e5383e35a64417867 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 30 May 2017 20:37:26 -0700 Subject: [PATCH 27/29] Don't sweat the "sync" settings on file descriptors as those flags aren't apparently fully portable Signed-off-by: Ralph Castain (cherry picked from commit 26e7515a5e1c976b56d8606de7af981d4b0b0858) --- orte/mca/state/base/state_base_fns.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/orte/mca/state/base/state_base_fns.c b/orte/mca/state/base/state_base_fns.c index 298e77b0379..dc4de766730 100644 --- a/orte/mca/state/base/state_base_fns.c +++ b/orte/mca/state/base/state_base_fns.c @@ -1148,15 +1148,6 @@ void 
orte_state_base_check_fds(orte_job_t *jdata) } else { opal_argv_append_nosize(&list, "rdwr"); } - if (flflags & O_DSYNC) { - opal_argv_append_nosize(&list, "dsync"); - } - if (flflags & O_RSYNC) { - opal_argv_append_nosize(&list, "rsync"); - } - if (flflags & O_SYNC) { - opal_argv_append_nosize(&list, "sync"); - } if (flk && F_UNLCK != fl.l_type) { if (F_WRLCK == fl.l_type) { opal_argv_append_nosize(&list, "wrlock"); From ddca65afbeddb5b7f6c5d4d73454e5dce82ee518 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 30 May 2017 21:42:42 -0700 Subject: [PATCH 28/29] Update PMIx to new release state Signed-off-by: Ralph Castain --- opal/mca/pmix/pmix2x/pmix/VERSION | 4 +- opal/mca/pmix/pmix2x/pmix/autogen.pl | 4 +- .../pmix/pmix2x/pmix/config/pmix_setup_cc.m4 | 7 +- opal/mca/pmix/pmix2x/pmix/configure.ac | 10 +- .../pmix/src/event/pmix_event_notification.c | 187 +++++++++++++----- .../mca/pmix/pmix2x/pmix/test/simple/simpft.c | 15 +- .../pmix/pmix2x/pmix/test/simple/simptest.c | 2 +- 7 files changed, 155 insertions(+), 74 deletions(-) diff --git a/opal/mca/pmix/pmix2x/pmix/VERSION b/opal/mca/pmix/pmix2x/pmix/VERSION index 727df5f26ac..c6d9bba4cca 100644 --- a/opal/mca/pmix/pmix2x/pmix/VERSION +++ b/opal/mca/pmix/pmix2x/pmix/VERSION @@ -30,7 +30,7 @@ greek= # command, or with the date (if "git describe" fails) in the form of # "date". -repo_rev=git1ce71dd +repo_rev=gitd5e4801 # If tarball_version is not empty, it is used as the version string in # the tarball filename, regardless of all other versions listed in @@ -44,7 +44,7 @@ tarball_version= # The date when this release was created -date="May 28, 2017" +date="May 30, 2017" # The shared library version of each of PMIx's public libraries. 
# These versions are maintained in accordance with the "Library diff --git a/opal/mca/pmix/pmix2x/pmix/autogen.pl b/opal/mca/pmix/pmix2x/pmix/autogen.pl index e8aa569bc94..2f86eaf9613 100755 --- a/opal/mca/pmix/pmix2x/pmix/autogen.pl +++ b/opal/mca/pmix/pmix2x/pmix/autogen.pl @@ -55,9 +55,9 @@ my $exclude_list; # Minimum versions -my $pmix_automake_version = "1.15.0"; +my $pmix_automake_version = "1.13.4"; my $pmix_autoconf_version = "2.69"; -my $pmix_libtool_version = "2.4.6"; +my $pmix_libtool_version = "2.4.2"; # Search paths my $pmix_autoconf_search = "autoconf"; diff --git a/opal/mca/pmix/pmix2x/pmix/config/pmix_setup_cc.m4 b/opal/mca/pmix/pmix2x/pmix/config/pmix_setup_cc.m4 index b117fbf7fdb..3029ffa5266 100644 --- a/opal/mca/pmix/pmix2x/pmix/config/pmix_setup_cc.m4 +++ b/opal/mca/pmix/pmix2x/pmix/config/pmix_setup_cc.m4 @@ -12,11 +12,11 @@ dnl Copyright (c) 2004-2006 The Regents of the University of California. dnl All rights reserved. dnl Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. dnl Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved. -dnl Copyright (c) 2012 Los Alamos National Security, LLC. All rights +dnl Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights dnl reserved. dnl Copyright (c) 2015 Research Organization for Information Science dnl and Technology (RIST). All rights reserved. -dnl Copyright (c) 2015-2016 Intel, Inc. All rights reserved. +dnl Copyright (c) 2015-2017 Intel, Inc. All rights reserved. 
dnl $COPYRIGHT$ dnl dnl Additional copyrights may follow @@ -38,7 +38,6 @@ AC_DEFUN([PMIX_SETUP_CC],[ AC_REQUIRE([AM_PROG_CC_C_O]) # We require a C99 compiant compiler - AC_PROG_CC_C99 # The result of AC_PROG_CC_C99 is stored in ac_cv_prog_cc_c99 if test "x$ac_cv_prog_cc_c99" = xno ; then AC_MSG_WARN([PMIx requires a C99 compiler]) @@ -322,7 +321,7 @@ AC_DEFUN([_PMIX_PROG_CC],[ # PMIX_VAR_SCOPE_PUSH([pmix_cflags_save dummy pmix_cc_arvgv0]) pmix_cflags_save="$CFLAGS" - AC_PROG_CC + AC_PROG_CC_C99 BASECC="`basename $CC`" CFLAGS="$pmix_cflags_save" AC_DEFINE_UNQUOTED(PMIX_CC, "$CC", [PMIx underlying C compiler]) diff --git a/opal/mca/pmix/pmix2x/pmix/configure.ac b/opal/mca/pmix/pmix2x/pmix/configure.ac index 99554efcb16..f8abb60d55b 100644 --- a/opal/mca/pmix/pmix2x/pmix/configure.ac +++ b/opal/mca/pmix/pmix2x/pmix/configure.ac @@ -12,7 +12,7 @@ # All rights reserved. # Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2006-2008 Sun Microsystems, Inc. All rights reserved. -# Copyright (c) 2006-2011 Los Alamos National Security, LLC. All rights +# Copyright (c) 2006-2017 Los Alamos National Security, LLC. All rights # reserved. # Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. # Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved. @@ -63,9 +63,9 @@ pmix_show_title "Configuring PMIx" AC_CANONICAL_TARGET # Init automake -AM_INIT_AUTOMAKE([foreign dist-bzip2 subdir-objects no-define 1.12.2 -Wall -Werror]) +AM_INIT_AUTOMAKE([foreign dist-bzip2 subdir-objects no-define 1.13.4 -Wall -Werror]) -# SILENT_RULES is new in AM 1.11, but we require 1.11 or higher via +# SILENT_RULES is new in AM 1.11, but we require 1.13.4 or higher via # autogen. Limited testing shows that calling SILENT_RULES directly # works in more cases than adding "silent-rules" to INIT_AUTOMAKE # (even though they're supposed to be identical). Shrug. 
@@ -179,10 +179,6 @@ PMIX_DO_AM_CONDITIONALS # Setup C compiler #################################################################### -CFLAGS_save="$CFLAGS" -AC_PROG_CC -CFLAGS="$CFLAGS_save" - AC_ARG_VAR(CC_FOR_BUILD,[build system C compiler]) AS_IF([test -z "$CC_FOR_BUILD"],[ AC_SUBST([CC_FOR_BUILD], [$CC]) diff --git a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c index 159100666f6..426063dcef3 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c +++ b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c @@ -157,10 +157,10 @@ static pmix_status_t notify_server_of_event(pmix_status_t status, PMIX_INFO_XFER(&chain->info[n], &info[n]); } } - /* put the evhandler name tag in the next-to-last element - we + /* add the evhandler name tag - we * will fill it in as each handler is called */ PMIX_INFO_LOAD(&chain->info[chain->ninfo-2], PMIX_EVENT_HDLR_NAME, NULL, PMIX_STRING); - /* now put the callback object tag in the last element */ + /* now add the callback object tag */ PMIX_INFO_LOAD(&chain->info[chain->ninfo-1], PMIX_EVENT_RETURN_OBJECT, NULL, PMIX_POINTER); /* we need to cache this event so we can pass it into @@ -324,15 +324,27 @@ static void progress_local_event_hdlr(pmix_status_t status, if (nxt->codes[0] == chain->status && check_range(&nxt->rng, &chain->source)) { chain->evhdlr = nxt; - /* add the handler name in case they want to reference it */ - if (NULL != chain->info[chain->ninfo-2].value.data.string) { - free(chain->info[chain->ninfo-2].value.data.string); + /* update the handler name in case they want to reference it */ + for (n=0; n < chain->ninfo; n++) { + if (0 == strncmp(chain->info[n].key, PMIX_EVENT_HDLR_NAME, PMIX_MAX_KEYLEN)) { + if (NULL != chain->info[n].value.data.string) { + free(chain->info[n].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[n].value.data.string = strdup(chain->evhdlr->name); + } + break; 
+ } } - if (NULL != chain->evhdlr->name) { - chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + /* update the evhdlr cbobject */ + for (n=0; n < chain->ninfo; n++) { + if (0 == strncmp(chain->info[n].key, PMIX_EVENT_RETURN_OBJECT, PMIX_MAX_KEYLEN)) { + if (NULL != chain->evhdlr->name) { + chain->info[n].value.data.ptr = chain->evhdlr->cbobject; + } + break; + } } - /* add any cbobject - the info struct for it is at the end */ - chain->info[chain->ninfo-1].value.data.ptr = nxt->cbobject; nxt->evhdlr(nxt->index, chain->status, &chain->source, chain->info, chain->ninfo, @@ -364,15 +376,27 @@ static void progress_local_event_hdlr(pmix_status_t status, * the source fits within it */ if (nxt->codes[n] == chain->status) { chain->evhdlr = nxt; - /* add the handler name in case they want to reference it */ - if (NULL != chain->info[chain->ninfo-2].value.data.string) { - free(chain->info[chain->ninfo-2].value.data.string); + /* update the handler name in case they want to reference it */ + for (n=0; n < chain->ninfo; n++) { + if (0 == strncmp(chain->info[n].key, PMIX_EVENT_HDLR_NAME, PMIX_MAX_KEYLEN)) { + if (NULL != chain->info[n].value.data.string) { + free(chain->info[n].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[n].value.data.string = strdup(chain->evhdlr->name); + } + break; + } } - if (NULL != chain->evhdlr->name) { - chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + /* update the evhdlr cbobject */ + for (n=0; n < chain->ninfo; n++) { + if (0 == strncmp(chain->info[n].key, PMIX_EVENT_RETURN_OBJECT, PMIX_MAX_KEYLEN)) { + if (NULL != chain->evhdlr->name) { + chain->info[n].value.data.ptr = chain->evhdlr->cbobject; + } + break; + } } - /* add any cbobject - the info struct for it is at the end */ - chain->info[chain->ninfo-1].value.data.ptr = nxt->cbobject; nxt->evhdlr(nxt->index, chain->status, &chain->source, chain->info, chain->ninfo, @@ -398,15 +422,27 @@ static void 
progress_local_event_hdlr(pmix_status_t status, * the source fits within it */ if (check_range(&nxt->rng, &chain->source)) { chain->evhdlr = nxt; - /* add the handler name in case they want to reference it */ - if (NULL != chain->info[chain->ninfo-2].value.data.string) { - free(chain->info[chain->ninfo-2].value.data.string); + /* update the handler name in case they want to reference it */ + for (n=0; n < chain->ninfo; n++) { + if (0 == strncmp(chain->info[n].key, PMIX_EVENT_HDLR_NAME, PMIX_MAX_KEYLEN)) { + if (NULL != chain->info[n].value.data.string) { + free(chain->info[n].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[n].value.data.string = strdup(chain->evhdlr->name); + } + break; + } } - if (NULL != chain->evhdlr->name) { - chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + /* update the evhdlr cbobject */ + for (n=0; n < chain->ninfo; n++) { + if (0 == strncmp(chain->info[n].key, PMIX_EVENT_RETURN_OBJECT, PMIX_MAX_KEYLEN)) { + if (NULL != chain->evhdlr->name) { + chain->info[n].value.data.ptr = chain->evhdlr->cbobject; + } + break; + } } - /* add any cbobject - the info struct for it is at the end */ - chain->info[chain->ninfo-1].value.data.ptr = nxt->cbobject; nxt->evhdlr(nxt->index, chain->status, &chain->source, chain->info, chain->ninfo, @@ -425,15 +461,27 @@ static void progress_local_event_hdlr(pmix_status_t status, if (1 == pmix_globals.events.last->ncodes && pmix_globals.events.last->codes[0] == chain->status) { chain->evhdlr = pmix_globals.events.last; - /* add the handler name in case they want to reference it */ - if (NULL != chain->info[chain->ninfo-2].value.data.string) { - free(chain->info[chain->ninfo-2].value.data.string); + /* update the handler name in case they want to reference it */ + for (n=0; n < chain->ninfo; n++) { + if (0 == strncmp(chain->info[n].key, PMIX_EVENT_HDLR_NAME, PMIX_MAX_KEYLEN)) { + if (NULL != chain->info[n].value.data.string) { + 
free(chain->info[n].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[n].value.data.string = strdup(chain->evhdlr->name); + } + break; + } } - if (NULL != chain->evhdlr->name) { - chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + /* update the evhdlr cbobject */ + for (n=0; n < chain->ninfo; n++) { + if (0 == strncmp(chain->info[n].key, PMIX_EVENT_RETURN_OBJECT, PMIX_MAX_KEYLEN)) { + if (NULL != chain->evhdlr->name) { + chain->info[n].value.data.ptr = chain->evhdlr->cbobject; + } + break; + } } - /* add any cbobject - the info struct for it is at the end */ - chain->info[chain->ninfo-1].value.data.ptr = pmix_globals.events.last->cbobject; chain->evhdlr->evhdlr(chain->evhdlr->index, chain->status, &chain->source, chain->info, chain->ninfo, @@ -445,15 +493,27 @@ static void progress_local_event_hdlr(pmix_status_t status, for (n=0; n < pmix_globals.events.last->ncodes; n++) { if (pmix_globals.events.last->codes[n] == chain->status) { chain->evhdlr = pmix_globals.events.last; - /* add the handler name in case they want to reference it */ - if (NULL != chain->info[chain->ninfo-2].value.data.string) { - free(chain->info[chain->ninfo-2].value.data.string); + /* update the handler name in case they want to reference it */ + for (n=0; n < chain->ninfo; n++) { + if (0 == strncmp(chain->info[n].key, PMIX_EVENT_HDLR_NAME, PMIX_MAX_KEYLEN)) { + if (NULL != chain->info[n].value.data.string) { + free(chain->info[n].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[n].value.data.string = strdup(chain->evhdlr->name); + } + break; + } } - if (NULL != chain->evhdlr->name) { - chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + /* update the evhdlr cbobject */ + for (n=0; n < chain->ninfo; n++) { + if (0 == strncmp(chain->info[n].key, PMIX_EVENT_RETURN_OBJECT, PMIX_MAX_KEYLEN)) { + if (NULL != chain->evhdlr->name) { + chain->info[n].value.data.ptr = chain->evhdlr->cbobject; + } + 
break; + } } - /* add any cbobject - the info struct for it is at the end */ - chain->info[chain->ninfo-1].value.data.ptr = pmix_globals.events.last->cbobject; chain->evhdlr->evhdlr(chain->evhdlr->index, chain->status, &chain->source, chain->info, chain->ninfo, @@ -465,15 +525,27 @@ static void progress_local_event_hdlr(pmix_status_t status, } else { /* gets run for all codes */ chain->evhdlr = pmix_globals.events.last; - /* add the handler name in case they want to reference it */ - if (NULL != chain->info[chain->ninfo-2].value.data.string) { - free(chain->info[chain->ninfo-2].value.data.string); + /* update the handler name in case they want to reference it */ + for (n=0; n < chain->ninfo; n++) { + if (0 == strncmp(chain->info[n].key, PMIX_EVENT_HDLR_NAME, PMIX_MAX_KEYLEN)) { + if (NULL != chain->info[n].value.data.string) { + free(chain->info[n].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[n].value.data.string = strdup(chain->evhdlr->name); + } + break; + } } - if (NULL != chain->evhdlr->name) { - chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + /* update the evhdlr cbobject */ + for (n=0; n < chain->ninfo; n++) { + if (0 == strncmp(chain->info[n].key, PMIX_EVENT_RETURN_OBJECT, PMIX_MAX_KEYLEN)) { + if (NULL != chain->evhdlr->name) { + chain->info[n].value.data.ptr = chain->evhdlr->cbobject; + } + break; + } } - /* add any cbobject - the info struct for it is at the end */ - chain->info[chain->ninfo-1].value.data.ptr = pmix_globals.events.last->cbobject; chain->evhdlr->evhdlr(chain->evhdlr->index, chain->status, &chain->source, chain->info, chain->ninfo, @@ -642,15 +714,28 @@ void pmix_invoke_local_event_hdlr(pmix_event_chain_t *chain) invk: - /* invoke the handler */ - /* add the handler name in case they want to reference it */ - if (NULL != chain->info[chain->ninfo-2].value.data.string) { - free(chain->info[chain->ninfo-2].value.data.string); + /* update the handler name in case they want to 
reference it */ + for (i=0; i < chain->ninfo; i++) { + if (0 == strncmp(chain->info[i].key, PMIX_EVENT_HDLR_NAME, PMIX_MAX_KEYLEN)) { + if (NULL != chain->info[i].value.data.string) { + free(chain->info[i].value.data.string); + } + if (NULL != chain->evhdlr->name) { + chain->info[i].value.data.string = strdup(chain->evhdlr->name); + } + break; + } } - if (NULL != chain->evhdlr->name) { - chain->info[chain->ninfo-2].value.data.string = strdup(chain->evhdlr->name); + /* update the evhdlr cbobject */ + for (i=0; i < chain->ninfo; i++) { + if (0 == strncmp(chain->info[i].key, PMIX_EVENT_RETURN_OBJECT, PMIX_MAX_KEYLEN)) { + if (NULL != chain->evhdlr->name) { + chain->info[i].value.data.ptr = chain->evhdlr->cbobject; + } + break; + } } - chain->info[chain->ninfo-1].value.data.ptr = chain->evhdlr->cbobject; + /* invoke the handler */ pmix_output_verbose(2, pmix_globals.debug_output, "[%s:%d] INVOKING EVHDLR %s", __FILE__, __LINE__, (NULL == chain->evhdlr->name) ? diff --git a/opal/mca/pmix/pmix2x/pmix/test/simple/simpft.c b/opal/mca/pmix/pmix2x/pmix/test/simple/simpft.c index 61d006da4e2..0844b936e06 100644 --- a/opal/mca/pmix/pmix2x/pmix/test/simple/simpft.c +++ b/opal/mca/pmix/pmix2x/pmix/test/simple/simpft.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved. 
* $COPYRIGHT$ * @@ -114,13 +114,14 @@ int main(int argc, char **argv) PMIx_Abort(PMIX_ERR_OUT_OF_RESOURCE, "Eat rocks", &proc, 1); pmix_output(0, "Client ns %s rank %d: Abort called", myproc.nspace, myproc.rank); - } + } else { /* everyone simply waits */ - while (!completed) { - struct timespec ts; - ts.tv_sec = 0; - ts.tv_nsec = 100000; - nanosleep(&ts, NULL); + while (!completed) { + struct timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = 100000; + nanosleep(&ts, NULL); + } } done: diff --git a/opal/mca/pmix/pmix2x/pmix/test/simple/simptest.c b/opal/mca/pmix/pmix2x/pmix/test/simple/simptest.c index 75969651faf..10b236a0c51 100644 --- a/opal/mca/pmix/pmix2x/pmix/test/simple/simptest.c +++ b/opal/mca/pmix/pmix2x/pmix/test/simple/simptest.c @@ -216,7 +216,7 @@ static void model_callback(size_t evhdlr_registration_id, /* just let us know it was received */ fprintf(stderr, "Model event handler called with status %d(%s)\n", status, PMIx_Error_string(status)); for (n=0; n < ninfo; n++) { - if (PMIX_STRING == info[n].value.type) { + if (0 == strncmp(info[n].key, PMIX_EVENT_HDLR_NAME, PMIX_MAX_KEYLEN)) { fprintf(stderr, "\t%s:\t%s\n", info[n].key, info[n].value.data.string); } } From 0ea1c67b42b8188b10abca31c64a4a2c56ed84e3 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 31 May 2017 07:38:37 -0700 Subject: [PATCH 29/29] Fix uninitialized variable. 
Set exit codes for failed launch so we get pretty error messages Signed-off-by: Ralph Castain (cherry picked from commit 9d6b929894714503b2b07d8319cf0e936bb342f9) --- opal/mca/pmix/base/pmix_base_fns.c | 46 ++++++++++++---------- orte/mca/odls/base/odls_base_default_fns.c | 2 +- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/opal/mca/pmix/base/pmix_base_fns.c b/opal/mca/pmix/base/pmix_base_fns.c index d129cf1df0a..7dd6752d531 100644 --- a/opal/mca/pmix/base/pmix_base_fns.c +++ b/opal/mca/pmix/base/pmix_base_fns.c @@ -121,6 +121,7 @@ static void lookup_cbfunc(int status, opal_list_t *data, void *cbdata) static void opcbfunc(int status, void *cbdata) { struct lookup_caddy_t *cd = (struct lookup_caddy_t*)cbdata; + cd->status = status; cd->active = false; } @@ -155,27 +156,29 @@ int opal_pmix_base_exchange(opal_value_t *indat, return rc; } } else { - caddy.active = true; - rc = opal_pmix.publish_nb(&ilist, opcbfunc, &caddy); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - OPAL_LIST_DESTRUCT(&ilist); - return rc; - } - while (caddy.active) { - usleep(10); - } - OPAL_LIST_DESTRUCT(&ilist); - if (OPAL_SUCCESS != caddy.status) { - OPAL_ERROR_LOG(caddy.status); - return caddy.status; - } - } - - /* lookup the other side's info - if a non-blocking form - * of lookup isn't available, then we use the blocking - * form and trust that the underlying system will WAIT - * until the other side publishes its data */ + caddy.status = -1; + caddy.active = true; + caddy.pdat = NULL; + rc = opal_pmix.publish_nb(&ilist, opcbfunc, &caddy); + if (OPAL_SUCCESS != rc) { + OPAL_ERROR_LOG(rc); + OPAL_LIST_DESTRUCT(&ilist); + return rc; + } + while (caddy.active) { + usleep(10); + } + OPAL_LIST_DESTRUCT(&ilist); + if (OPAL_SUCCESS != caddy.status) { + OPAL_ERROR_LOG(caddy.status); + return caddy.status; + } + } + + /* lookup the other side's info - if a non-blocking form + * of lookup isn't available, then we use the blocking + * form and trust that the underlying system 
will WAIT + * until the other side publishes its data */ pdat = OBJ_NEW(opal_pmix_pdata_t); pdat->value.key = strdup(outdat->value.key); pdat->value.type = outdat->value.type; @@ -214,6 +217,7 @@ int opal_pmix_base_exchange(opal_value_t *indat, return rc; } } else { + caddy.status = -1; caddy.active = true; caddy.pdat = pdat; keys = NULL; diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index a243157a4a3..932980d3e15 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -702,7 +702,6 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) opal_argv_free(argvptr); } if (ORTE_SUCCESS != rc) { - opal_output(0, "%s:%d", __FILE__, __LINE__); state = ORTE_PROC_STATE_FAILED_TO_LAUNCH; goto errorout; } @@ -798,6 +797,7 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata) errorout: ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE); + child->exit_code = rc; ORTE_ACTIVATE_PROC_STATE(&child->name, state); OBJ_RELEASE(cd); }