diff --git a/opal/class/opal_hotel.h b/opal/class/opal_hotel.h index 8216d4cfd61..f8ecd4c0cb5 100644 --- a/opal/class/opal_hotel.h +++ b/opal/class/opal_hotel.h @@ -1,10 +1,11 @@ /* * Copyright (c) 2012-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved + * Copyright (c) 2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -23,7 +24,7 @@ * * One use case for this class is for ACK-based network retransmission * schemes (NACK-based retransmission schemes probably can use - * opal_ring_buffer). + * opal_ring_buffer). * * For ACK-based retransmission schemes, a hotel might be used * something like this: @@ -61,7 +62,7 @@ BEGIN_C_DECLS struct opal_hotel_t; /* User-supplied function to be invoked when an occupant is evicted. */ -typedef void (*opal_hotel_eviction_callback_fn_t)(struct opal_hotel_t *hotel, +typedef void (*opal_hotel_eviction_callback_fn_t)(struct opal_hotel_t *hotel, int room_num, void *occupant); @@ -248,6 +249,55 @@ static inline void opal_hotel_checkout(opal_hotel_t *hotel, int room_num) assume the upper layer knows what it's doing. */ } +/** + * Check the specified occupant out of the hotel and return the occupant. + * + * @param hotel Pointer to hotel (IN) + * @param room_num Room number to checkout (IN) + * @param occupant Receives the checked-out occupant (OUT) + * If there is an occupant in the room, their timer is canceled and + * they are checked out. 
+ *
+ * Use this checkout when the caller needs the occupant back. If the room
+ * is empty, *occupant is set to NULL and a diagnostic is emitted.
+ */
+static inline void opal_hotel_checkout_and_return_occupant(opal_hotel_t *hotel, int room_num, void **occupant)
+{
+    opal_hotel_room_t *room;
+
+    /* Bozo check: reject negative indices as well as too-large ones */
+    assert(room_num >= 0 && room_num < hotel->num_rooms);
+
+    /* If there's an occupant in the room, check them out */
+    room = &(hotel->rooms[room_num]);
+    if (OPAL_LIKELY(NULL != room->occupant)) {
+        opal_output (10, "checking out occupant %p from room num %d", room->occupant, room_num);
+        *occupant = room->occupant;
+        room->occupant = NULL;
+        opal_event_del(&(room->eviction_timer_event));
+        hotel->last_unoccupied_room++;
+        assert(hotel->last_unoccupied_room < hotel->num_rooms);
+        hotel->unoccupied_rooms[hotel->last_unoccupied_room] = room_num;
+    }
+    else {
+        /* Never leave the OUT parameter indeterminate for the caller */
+        *occupant = NULL;
+        opal_output( 0, " OOPS there is no occupant in room_num %d", room_num);
+    }
+}
+
+/**
+ * Returns true if the hotel is empty (no occupant)
+ *
+ * @param hotel Pointer to hotel (IN)
+ * @return bool true if empty, false if there is at least one occupant
+ */
+static inline bool opal_hotel_is_empty (opal_hotel_t *hotel)
+{
+    /* Each checkout pushes a room onto unoccupied_rooms; top == last slot means all rooms are free */
+    return hotel->last_unoccupied_room == hotel->num_rooms - 1;
+}
+
 /** * Destroy a hotel. * diff --git a/opal/class/opal_object.h b/opal/class/opal_object.h index 79470d586eb..02d9b17ada7 100644 --- a/opal/class/opal_object.h +++ b/opal/class/opal_object.h @@ -5,15 +5,13 @@ * Copyright (c) 2004-2007 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2014 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ - * * Additional copyrights may follow - * * $HEADER$ */ @@ -46,13 +44,13 @@ * OBJ_CLASS_DECLARATION(sally_t); * @endcode * All classes must have a parent which is also class. - * + * * In an implementation (.c) file, instantiate a class descriptor for * the class like this: * @code * OBJ_CLASS_INSTANCE(sally_t, parent_t, sally_construct, sally_destruct); * @endcode - * This macro actually expands to + * This macro actually expands to * @code * opal_class_t sally_t_class = { * "sally_t", @@ -240,7 +238,7 @@ struct opal_object_t { * constructor. * * @param type Type (class) of the object - * @return Pointer to the object + * @return Pointer to the object */ static inline opal_object_t *opal_obj_new(opal_class_t * cls); #if OPAL_ENABLE_DEBUG @@ -304,12 +302,14 @@ static inline opal_object_t *opal_obj_new_debug(opal_class_t* type, const char* * to NULL. * * @param object Pointer to the object + * + * */ #if OPAL_ENABLE_DEBUG #define OBJ_RELEASE(object) \ do { \ - assert(NULL != ((opal_object_t *) (object))->obj_class); \ assert(OPAL_OBJ_MAGIC_ID == ((opal_object_t *) (object))->obj_magic_id); \ + assert(NULL != ((opal_object_t *) (object))->obj_class); \ if (0 == opal_obj_update((opal_object_t *) (object), -1)) { \ OBJ_SET_MAGIC_ID((object), 0); \ opal_obj_run_destructors((opal_object_t *) (object)); \ @@ -457,7 +457,7 @@ static inline void opal_obj_run_destructors(opal_object_t * object) * * @param size Size of the object * @param cls Pointer to the class descriptor of this object - * @return Pointer to the object + * @return Pointer to the object */ static inline opal_object_t *opal_obj_new(opal_class_t * cls) { diff --git a/opal/util/timings.c b/opal/util/timings.c index 65817bf2692..6c0590acd76 100644 --- a/opal/util/timings.c +++ b/opal/util/timings.c @@ -201,7 +201,7 @@ static get_ts_t _init_timestamping(opal_timer_type_t type) } } -opal_timing_event_t *opal_timing_event_alloc(opal_timing_t *t) +static opal_timing_event_t 
*opal_timing_event_alloc(opal_timing_t *t) { if( t->buffer_offset >= t->buffer_size ){ // notch timings overhead diff --git a/orte/include/orte/constants.h b/orte/include/orte/constants.h index cc4bcdbcd31..d675a3499c0 100644 --- a/orte/include/orte/constants.h +++ b/orte/include/orte/constants.h @@ -11,6 +11,7 @@ * All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -62,7 +63,7 @@ enum { ORTE_ERR_UNPACK_INADEQUATE_SPACE = OPAL_ERR_UNPACK_INADEQUATE_SPACE, ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER = OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER, ORTE_ERR_TYPE_MISMATCH = OPAL_ERR_TYPE_MISMATCH, - ORTE_ERR_OPERATION_UNSUPPORTED = OPAL_ERR_OPERATION_UNSUPPORTED, + ORTE_ERR_OPERATION_UNSUPPORTED = OPAL_ERR_OPERATION_UNSUPPORTED, ORTE_ERR_UNKNOWN_DATA_TYPE = OPAL_ERR_UNKNOWN_DATA_TYPE, ORTE_ERR_BUFFER = OPAL_ERR_BUFFER, ORTE_ERR_DATA_TYPE_REDEF = OPAL_ERR_DATA_TYPE_REDEF, @@ -85,7 +86,7 @@ enum { ORTE_ERR_CONNECTION_FAILED = OPAL_ERR_CONNECTION_FAILED, ORTE_ERR_AUTHENTICATION_FAILED = OPAL_ERR_AUTHENTICATION_FAILED, ORTE_ERR_COMM_FAILURE = OPAL_ERR_COMM_FAILURE, - + /* error codes specific to ORTE - don't forget to update orte/util/error_strings.c when adding new error codes!! 
Otherwise, the error reporting system will potentially crash, @@ -133,7 +134,18 @@ enum { ORTE_ERR_SENSOR_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 42), ORTE_ERR_ALLOCATION_PENDING = (ORTE_ERR_BASE - 43), ORTE_ERR_NO_PATH_TO_TARGET = (ORTE_ERR_BASE - 44), - ORTE_ERR_OP_IN_PROGRESS = (ORTE_ERR_BASE - 45) + ORTE_ERR_OP_IN_PROGRESS = (ORTE_ERR_BASE - 45), + ORTE_ERR_OPEN_CHANNEL_PEER_FAIL = (ORTE_ERR_BASE - 46), + ORTE_ERR_OPEN_CHANNEL_PEER_REJECT = (ORTE_ERR_BASE - 47), + ORTE_ERR_QOS_TYPE_UNSUPPORTED = (ORTE_ERR_BASE - 48), + ORTE_ERR_QOS_ACK_WINDOW_FULL = (ORTE_ERR_BASE - 49), + ORTE_ERR_ACK_TIMEOUT_SENDER = (ORTE_ERR_BASE - 50), + ORTE_ERR_ACK_TIMEOUT_RECEIVER = (ORTE_ERR_BASE - 51), + ORTE_ERR_LOST_MSG_IN_WINDOW = (ORTE_ERR_BASE - 52), + ORTE_ERR_CHANNEL_BUSY = (ORTE_ERR_BASE - 53), + ORTE_ERR_DUPLICATE_MSG = (ORTE_ERR_BASE - 54), + ORTE_ERR_OUT_OF_ORDER_MSG = (ORTE_ERR_BASE - 55), + ORTE_ERR_OPEN_CHANNEL_DUPLICATE = (ORTE_ERR_BASE - 56), }; #define ORTE_ERR_MAX (ORTE_ERR_BASE - 100) diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index 11c10a2db7a..b6b4068e8d9 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -5,20 +5,20 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. 
* Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -54,6 +54,7 @@ #include "orte/mca/grpcomm/base/base.h" #include "orte/mca/oob/base/base.h" #include "orte/mca/rml/rml.h" +#include "orte/mca/qos/base/base.h" #include "orte/mca/odls/odls_types.h" #include "orte/mca/plm/plm.h" #include "orte/mca/filem/base/base.h" @@ -116,7 +117,6 @@ int orte_ess_base_app_setup(bool db_restrict_local) /* get a separate orte event base */ orte_event_base = opal_start_progress_thread("orte", true); progress_thread_running = true; - /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -143,7 +143,6 @@ int orte_ess_base_app_setup(bool db_restrict_local) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base, orte_process_info.nodename)); - if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, @@ -152,13 +151,11 @@ int orte_ess_base_app_setup(bool db_restrict_local) error = "orte_session_dir"; goto error; } - /* Once the session directory location has been established, set the opal_output env file location to be in the proc-specific session directory. 
*/ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); - /* store the session directory location in the database */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_DSTORE_JOB_SDIR); @@ -187,7 +184,6 @@ int orte_ess_base_app_setup(bool db_restrict_local) } OBJ_DESTRUCT(&kv); } - /* Setup the communication infrastructure */ /* * OOB Layer @@ -202,7 +198,6 @@ int orte_ess_base_app_setup(bool db_restrict_local) error = "orte_oob_base_select"; goto error; } - /* Runtime Messaging Layer */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -214,14 +209,23 @@ int orte_ess_base_app_setup(bool db_restrict_local) error = "orte_rml_base_select"; goto error; } - + /* Messaging QoS Layer */ + if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_qos_base_framework, 0))) { + ORTE_ERROR_LOG(ret); + error = "orte_qos_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_qos_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_qos_base_select"; + goto error; + } /* setup the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } - /* Routed system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -233,7 +237,6 @@ int orte_ess_base_app_setup(bool db_restrict_local) error = "orte_routed_base_select"; goto error; } - /* * Group communications */ @@ -247,7 +250,6 @@ int orte_ess_base_app_setup(bool db_restrict_local) error = "orte_grpcomm_base_select"; goto error; } - /* non-daemon/HNP apps can only have the default proxy PLM * module open - provide a chance for it to initialize */ @@ -256,22 +258,18 @@ int orte_ess_base_app_setup(bool db_restrict_local) error = "orte_plm_init"; goto error; } - /* enable communication via the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = 
"orte_rml.enable_comm"; goto error; } - /* setup the routed info */ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; goto error; } - - #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC @@ -296,13 +294,11 @@ int orte_ess_base_app_setup(bool db_restrict_local) error = "orte_sstore_base_select"; goto error; } - /* apps need the OPAL CR stuff */ opal_cr_set_enabled(true); #else opal_cr_set_enabled(false); #endif - /* Initalize the CR setup * Note: Always do this, even in non-FT builds. * If we don't some user level tools may hang. @@ -312,7 +308,6 @@ int orte_ess_base_app_setup(bool db_restrict_local) error = "orte_cr_init"; goto error; } - /* open the distributed file system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -324,9 +319,7 @@ int orte_ess_base_app_setup(bool db_restrict_local) error = "orte_dfs_base_select"; goto error; } - return ORTE_SUCCESS; - error: if (!progress_thread_running) { /* can't send the help message, so ensure it @@ -337,7 +330,6 @@ int orte_ess_base_app_setup(bool db_restrict_local) orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); - return ret; } @@ -375,7 +367,7 @@ int orte_ess_base_app_finalize(void) /* free the event base to cleanup memory */ opal_stop_progress_thread("orte", true); - return ORTE_SUCCESS; + return ORTE_SUCCESS; } /* @@ -405,16 +397,14 @@ void orte_ess_base_app_abort(int status, bool report) /* Exit - do NOT do a normal finalize as this will very likely * hang the process. We are aborting due to an abnormal condition - * that precludes normal cleanup + * that precludes normal cleanup * - * We do need to do the following bits to make sure we leave a + * We do need to do the following bits to make sure we leave a * clean environment. 
Taken from orte_finalize(): * - Assume errmgr cleans up child processes before we exit. */ - /* CRS cleanup since it may have a named pipe and thread active */ orte_cr_finalize(); - /* If we were asked to report this termination, do so. * Since singletons don't start an HNP unless necessary, and * direct-launched procs don't have daemons at all, only send @@ -430,11 +420,9 @@ void orte_ess_base_app_abort(int status, bool report) * have a chance to be sent */ nanosleep(&tp, NULL); } - - /* - Clean out the global structures + /* - Clean out the global structures * (not really necessary, but good practice) */ orte_proc_info_finalize(); - /* Now Exit */ _exit(status); } diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index c536631eecd..526c7732d19 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -5,7 +5,7 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. @@ -13,12 +13,12 @@ * et Automatique. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights - * reserved. + * reserved. * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -50,6 +50,7 @@ #include "orte/mca/routed/base/base.h" #include "orte/mca/routed/routed.h" #include "orte/mca/oob/base/base.h" +#include "orte/mca/qos/base/base.h" #include "orte/mca/dfs/base/base.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/grpcomm/base/base.h" @@ -123,26 +124,21 @@ int orte_ess_base_orted_setup(char **hosts) opal_proc_local_set(&orte_process_info.super); plm_in_use = false; - /* setup callback for SIGPIPE */ setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /* Set signal handlers to catch kill signals so we can properly clean up - * after ourselves. + * after ourselves. */ setup_sighandler(SIGTERM, &term_handler, shutdown_signal); setup_sighandler(SIGINT, &int_handler, shutdown_signal); - /** setup callbacks for signals we should ignore */ setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback); setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback); - signals_set = true; - #if OPAL_HAVE_HWLOC { hwloc_obj_t obj; unsigned i, j; - /* get the local topology */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) { @@ -152,7 +148,6 @@ int orte_ess_base_orted_setup(char **hosts) } /* generate the signature */ orte_topo_signature = opal_hwloc_base_get_topo_signature(opal_hwloc_topology); - /* remove the hostname from the topology. Unfortunately, hwloc * decided to add the source hostname to the "topology", thus * rendering it unusable as a pure topological description. 
So @@ -177,14 +172,12 @@ int orte_ess_base_orted_setup(char **hosts) break; } } - if (15 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) { opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO); } } #endif - /* open and setup the opal_pstat framework so we can provide * process stats if requested */ @@ -198,7 +191,6 @@ int orte_ess_base_orted_setup(char **hosts) error = "opal_pstat_base_select"; goto error; } - /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -210,20 +202,18 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_state_base_select"; goto error; } - /* open the errmgr */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_open"; goto error; } - /* some environments allow remote launches - e.g., ssh - so * open and select something -only- if we are given * a specific module to use */ (void) mca_base_var_env_name("plm", &param); - + plm_in_use = !!(getenv(param)); free (param); @@ -240,7 +230,6 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } } - /* setup my session directory here as the OOB may need it */ if (orte_create_session_dirs) { OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output, @@ -248,7 +237,7 @@ int orte_ess_base_orted_setup(char **hosts) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? 
"UNDEF" : orte_process_info.tmpdir_base, orte_process_info.nodename)); - + /* take a pass thru the session directory code to fillin the * tmpdir names - don't create anything yet */ @@ -264,7 +253,6 @@ int orte_ess_base_orted_setup(char **hosts) * stale directories laying around */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - /* now actually create the directory tree */ if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, @@ -274,18 +262,15 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_session_dir"; goto error; } - /* set the opal_output env file location to be in the * proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); - /* setup stdout/stderr */ if (orte_debug_daemons_file_flag) { /* if we are debugging to a file, then send stdout/stderr to * the orted log file */ - /* get my jobid */ if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring, ORTE_PROC_MY_NAME->jobid))) { @@ -293,7 +278,6 @@ int orte_ess_base_orted_setup(char **hosts) error = "convert_jobid"; goto error; } - /* define a log file name in the session directory */ snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log", jobidstring, orte_process_info.nodename); @@ -302,7 +286,7 @@ int orte_ess_base_orted_setup(char **hosts) orte_process_info.top_session_dir, log_file, NULL); - + fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640); if (fd < 0) { /* couldn't open the file for some reason, so @@ -318,7 +302,6 @@ int orte_ess_base_orted_setup(char **hosts) } } } - /* Setup the communication infrastructure */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -340,14 +323,23 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_rml_base_select"; goto error; } - + /* Messaging QoS Layer */ + if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_qos_base_framework, 0))) { + ORTE_ERROR_LOG(ret); + error = 
"orte_qos_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_qos_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_qos_base_select"; + goto error; + } /* select the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } - /* Routed system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -359,7 +351,6 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_routed_base_select"; goto error; } - /* * Group communications */ @@ -373,7 +364,6 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_grpcomm_base_select"; goto error; } - /* Open/select the odls */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -385,7 +375,6 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_odls_base_select"; goto error; } - /* Open/select the rtc */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rtc_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -397,14 +386,12 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_rtc_base_select"; goto error; } - /* enable communication with the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } - #if ORTE_ENABLE_STATIC_PORTS /* if we are using static ports, then we need to setup * the daemon info so the RML can function properly @@ -417,7 +404,6 @@ int orte_ess_base_orted_setup(char **hosts) * if we are trying to setup common or static ports */ orte_routed.update_routing_plan(); - /* extract the node info from the environment and * build a nidmap from it */ @@ -433,7 +419,6 @@ int orte_ess_base_orted_setup(char **hosts) * need to do it anyway just to initialize things */ orte_routed.update_routing_plan(); - /* Now provide a chance for the PLM * to perform any module-specific init functions. 
This * needs to occur AFTER the communications are setup @@ -448,7 +433,6 @@ int orte_ess_base_orted_setup(char **hosts) goto error; } } - /* setup the global job and node arrays */ orte_job_data = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, @@ -459,7 +443,6 @@ int orte_ess_base_orted_setup(char **hosts) error = "setup job array"; goto error; } - orte_node_pool = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, @@ -478,18 +461,15 @@ int orte_ess_base_orted_setup(char **hosts) error = "setup node topologies array"; goto error; } - - /* Setup the job data object for the daemons */ + /* Setup the job data object for the daemons */ /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = ORTE_PROC_MY_NAME->jobid; opal_pointer_array_set_item(orte_job_data, 0, jdata); - /* every job requires at least one app */ app = OBJ_NEW(orte_app_context_t); opal_pointer_array_set_item(jdata->apps, 0, app); jdata->num_apps++; - /* create and store a node object where we are */ node = OBJ_NEW(orte_node_t); node->name = strdup(orte_process_info.nodename); @@ -498,18 +478,15 @@ int orte_ess_base_orted_setup(char **hosts) /* point our topology to the one detected locally */ node->topology = opal_hwloc_topology; #endif - /* create and store a proc object for us */ proc = OBJ_NEW(orte_proc_t); proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; - proc->pid = orte_process_info.pid; proc->rml_uri = orte_rml.get_contact_info(); proc->state = ORTE_PROC_STATE_RUNNING; opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); - - /* record that the daemon (i.e., us) is on this node + /* record that the daemon (i.e., us) is on this node * NOTE: we do not add the proc object to the node's * proc array because we are not an application proc. 
* Instead, we record it in the daemon field of the @@ -519,33 +496,28 @@ int orte_ess_base_orted_setup(char **hosts) node->daemon = proc; ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED); node->state = ORTE_NODE_STATE_UP; - /* now point our proc node field to the node */ OBJ_RETAIN(node); /* keep accounting straight */ proc->node = node; - /* record that the daemon job is running */ jdata->num_procs = 1; jdata->state = ORTE_JOB_STATE_RUNNING; /* obviously, we have "reported" */ jdata->num_reported = 1; - /* setup the PMIx server */ if (ORTE_SUCCESS != (ret = pmix_server_init())) { ORTE_ERROR_LOG(ret); error = "pmix server init"; goto error; } - /* setup the routed info - the selected routed component - * will know what to do. + * will know what to do. */ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; goto error; } - /* setup I/O forwarding system - must come after we init routes */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -557,7 +529,6 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_iof_base_select"; goto error; } - /* setup the FileM */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -569,7 +540,7 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_filem_base_select"; goto error; } - + #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC @@ -594,13 +565,12 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_sstore_base_select"; goto error; } - + /* For daemons, ORTE doesn't need the OPAL CR stuff */ opal_cr_set_enabled(false); #else opal_cr_set_enabled(false); #endif - /* * Initalize the CR setup * Note: Always do this, even in non-FT builds. 
@@ -611,7 +581,6 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_cr_init"; goto error; } - /* setup the DFS framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -623,7 +592,6 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_dfs_select"; goto error; } - /* setup the SCHIZO framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_schizo_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -635,14 +603,11 @@ int orte_ess_base_orted_setup(char **hosts) error = "orte_schizo_select"; goto error; } - return ORTE_SUCCESS; - error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); - return ORTE_ERR_SILENT; } @@ -656,15 +621,12 @@ int orte_ess_base_orted_finalize(void) opal_event_signal_del(&sigusr1_handler); opal_event_signal_del(&sigusr2_handler); } - /* cleanup */ if (NULL != log_path) { unlink(log_path); } - /* shutdown the pmix server */ pmix_server_finalize(); - /* close frameworks */ (void) mca_base_framework_close(&orte_schizo_base_framework); (void) mca_base_framework_close(&orte_filem_base_framework); @@ -672,10 +634,8 @@ int orte_ess_base_orted_finalize(void) (void) mca_base_framework_close(&orte_iof_base_framework); (void) mca_base_framework_close(&orte_errmgr_base_framework); (void) mca_base_framework_close(&orte_plm_base_framework); - /* close the dfs so its threads can exit */ (void) mca_base_framework_close(&orte_dfs_base_framework); - /* make sure our local procs are dead */ orte_odls.kill_local_procs(NULL); (void) mca_base_framework_close(&orte_rtc_base_framework); @@ -684,13 +644,10 @@ int orte_ess_base_orted_finalize(void) (void) mca_base_framework_close(&orte_rml_base_framework); (void) mca_base_framework_close(&orte_oob_base_framework); (void) mca_base_framework_close(&orte_state_base_framework); - (void) mca_base_framework_close(&opal_dstore_base_framework); - /* cleanup any lingering 
session directories */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - - return ORTE_SUCCESS; + return ORTE_SUCCESS; } static void shutdown_signal(int fd, short flags, void *arg) diff --git a/orte/mca/ess/base/ess_base_std_tool.c b/orte/mca/ess/base/ess_base_std_tool.c index 1368b2ae3d6..8c8cefa7bee 100644 --- a/orte/mca/ess/base/ess_base_std_tool.c +++ b/orte/mca/ess/base/ess_base_std_tool.c @@ -5,7 +5,7 @@ * Copyright (c) 2004-2009 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. @@ -15,9 +15,9 @@ * Copyright (c) 2014 Hochschule Esslingen. All rights reserved. * * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -43,6 +43,7 @@ #include "orte/mca/oob/base/base.h" #include "orte/mca/plm/base/base.h" #include "orte/mca/rml/base/base.h" +#include "orte/mca/qos/base/base.h" #include "orte/mca/routed/base/base.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/iof/base/base.h" @@ -88,7 +89,6 @@ int orte_ess_base_tool_setup(void) progress_thread_running = true; orte_event_base_active = true; } - /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -100,7 +100,6 @@ int orte_ess_base_tool_setup(void) error = "orte_state_base_select"; goto error; } - /* open and setup the error manager */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -112,7 +111,6 @@ int orte_ess_base_tool_setup(void) error = "orte_errmgr_base_select"; goto error; } - /* Setup the communication infrastructure */ if (ORTE_SUCCESS != (ret = 
mca_base_framework_open(&orte_oob_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -124,7 +122,6 @@ int orte_ess_base_tool_setup(void) error = "orte_oob_base_select"; goto error; } - /* Runtime Messaging Layer */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -136,6 +133,17 @@ int orte_ess_base_tool_setup(void) error = "orte_rml_base_select"; goto error; } + /* Messaging QoS Layer */ + if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_qos_base_framework, 0))) { + ORTE_ERROR_LOG(ret); + error = "orte_qos_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_qos_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_qos_base_select"; + goto error; + } /* Routed system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -147,20 +155,17 @@ int orte_ess_base_tool_setup(void) error = "orte_routed_base_select"; goto error; } - /* since I am a tool, then all I really want to do is communicate. * So setup communications and be done - finding the HNP * to which I want to communicate and setting up a route for * that link is my responsibility */ - /* enable communication via the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } - /* we -may- need to know the name of the head * of our session directory tree, particularly the * tmp base where any other session directories on @@ -174,16 +179,16 @@ int orte_ess_base_tool_setup(void) error = "define session dir names"; goto error; } - + /* setup the routed info - the selected routed component - * will know what to do. + * will know what to do. 
*/ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; goto error; } - + /* setup I/O forwarding system - must come after we init routes */ if (NULL != orte_process_info.my_hnp_uri) { /* only do this if we were given an HNP */ @@ -207,7 +212,7 @@ int orte_ess_base_tool_setup(void) /* we don't select the plm framework as we only want the * base proxy functions */ } - + #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC @@ -233,7 +238,7 @@ int orte_ess_base_tool_setup(void) error = "orte_sstore_base_select"; goto error; } - + /* Tools do not need all the OPAL CR stuff */ opal_cr_set_enabled(false); #endif @@ -251,12 +256,12 @@ int orte_ess_base_tool_setup(void) } return ORTE_SUCCESS; - + error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); - + return ret; } @@ -286,5 +291,5 @@ int orte_ess_base_tool_finalize(void) opal_stop_progress_thread("orte", true); progress_thread_running = false; } - return ORTE_SUCCESS; + return ORTE_SUCCESS; } diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index cf973e3c84e..5914490974f 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -5,19 +5,19 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights - * reserved. 
- * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ * */ @@ -55,6 +55,7 @@ #include "orte/mca/oob/base/base.h" #include "orte/mca/rml/base/base.h" +#include "orte/mca/qos/base/base.h" #include "orte/mca/rml/rml_types.h" #include "orte/mca/routed/base/base.h" #include "orte/mca/routed/routed.h" @@ -231,7 +232,6 @@ static int rte_init(void) error = "opal_pstat_base_select"; goto error; } - /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -261,7 +261,6 @@ static int rte_init(void) error = "orte_plm_base_open"; goto error; } - if (ORTE_SUCCESS != (ret = orte_plm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_select"; @@ -296,7 +295,6 @@ static int rte_init(void) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? 
"UNDEF" : orte_process_info.tmpdir_base, orte_process_info.nodename)); - /* take a pass thru the session directory code to fillin the * tmpdir names - don't create anything yet */ @@ -325,7 +323,6 @@ static int rte_init(void) } /* Setup the communication infrastructure */ - /* * OOB Layer */ @@ -354,12 +351,23 @@ static int rte_init(void) goto error; } + /* Messaging QoS Layer */ + if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_qos_base_framework, 0))) { + ORTE_ERROR_LOG(ret); + error = "orte_qos_base_open"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_qos_base_select())) { + ORTE_ERROR_LOG(ret); + error = "orte_qos_base_select"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } - /* setup the global job and node arrays */ orte_job_data = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, @@ -370,7 +378,6 @@ static int rte_init(void) error = "setup job array"; goto error; } - orte_node_pool = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, @@ -389,8 +396,7 @@ static int rte_init(void) error = "setup node topologies array"; goto error; } - - /* Setup the job data object for the daemons */ + /* Setup the job data object for the daemons */ /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = ORTE_PROC_MY_NAME->jobid; @@ -400,12 +406,11 @@ static int rte_init(void) * are running! 
*/ jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED; - + /* every job requires at least one app */ app = OBJ_NEW(orte_app_context_t); opal_pointer_array_set_item(jdata->apps, 0, app); jdata->num_apps++; - /* create and store a node object where we are */ node = OBJ_NEW(orte_node_t); node->name = strdup(orte_process_info.nodename); @@ -420,20 +425,17 @@ static int rte_init(void) opal_pointer_array_add(orte_node_topologies, t); } #endif - /* create and store a proc object for us */ proc = OBJ_NEW(orte_proc_t); proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; - proc->pid = orte_process_info.pid; proc->rml_uri = orte_rml.get_contact_info(); proc->state = ORTE_PROC_STATE_RUNNING; OBJ_RETAIN(node); /* keep accounting straight */ proc->node = node; opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); - - /* record that the daemon (i.e., us) is on this node + /* record that the daemon (i.e., us) is on this node * NOTE: we do not add the proc object to the node's * proc array because we are not an application proc. 
* Instead, we record it in the daemon field of the @@ -443,7 +445,6 @@ static int rte_init(void) node->daemon = proc; ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED); node->state = ORTE_NODE_STATE_UP; - /* if we are to retain aliases, get ours */ if (orte_retain_aliases) { aliases = NULL; @@ -455,13 +456,11 @@ static int rte_init(void) orte_set_attribute(&node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, aptr, OPAL_STRING); free(aptr); } - /* record that the daemon job is running */ jdata->num_procs = 1; jdata->state = ORTE_JOB_STATE_RUNNING; /* obviously, we have "reported" */ jdata->num_reported = 1; - /* * Routed system */ @@ -475,8 +474,6 @@ static int rte_init(void) error = "orte_routed_base_select"; goto error; } - - /* * Group communications */ @@ -490,7 +487,6 @@ static int rte_init(void) error = "orte_grpcomm_base_select"; goto error; } - /* Now provide a chance for the PLM * to perform any module-specific init functions. This * needs to occur AFTER the communications are setup @@ -501,7 +497,6 @@ static int rte_init(void) error = "orte_plm_init"; goto error; } - /* * Setup the remaining resource * management and errmgr frameworks - application procs @@ -512,18 +507,17 @@ static int rte_init(void) ORTE_ERROR_LOG(ret); error = "orte_ras_base_open"; goto error; - } + } if (ORTE_SUCCESS != (ret = orte_ras_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_ras_base_find_available"; goto error; } - if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rmaps_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_rmaps_base_open"; goto error; - } + } if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rmaps_base_find_available"; @@ -576,7 +570,6 @@ static int rte_init(void) } } #endif - /* Open/select the odls */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -588,7 +581,6 @@ static int rte_init(void) error = "orte_odls_base_select"; goto error; } - 
/* Open/select the rtc */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rtc_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -600,21 +592,18 @@ static int rte_init(void) error = "orte_rtc_base_select"; goto error; } - /* enable communication with the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } - /* we are an hnp, so update the contact info field for later use */ orte_process_info.my_hnp_uri = orte_rml.get_contact_info(); proc->rml_uri = strdup(orte_process_info.my_hnp_uri); /* we are also officially a daemon, so better update that field too */ orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri); - /* setup the orte_show_help system to recv remote output */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP, ORTE_RML_PERSISTENT, orte_show_help_recv, NULL); @@ -624,17 +613,15 @@ static int rte_init(void) * proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); - /* save my contact info in a file for others to find */ jobfam_dir = opal_dirname(orte_process_info.job_session_dir); contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL); free(jobfam_dir); - OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s writing contact file %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), contact_path)); - + if (ORTE_SUCCESS != (ret = orte_write_hnp_contact_file(contact_path))) { OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s writing contact file failed with error %s", @@ -647,23 +634,20 @@ static int rte_init(void) } free(contact_path); } - /* setup the PMIx server */ if (ORTE_SUCCESS != (ret = pmix_server_init())) { ORTE_ERROR_LOG(ret); error = "pmix server init"; goto error; } - /* setup the routed info - the selected routed component - * will know what to do. + * will know what to do. 
*/ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; goto error; } - /* setup I/O forwarding system - must come after we init routes */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -675,7 +659,6 @@ static int rte_init(void) error = "orte_iof_base_select"; goto error; } - /* setup the FileM */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -687,7 +670,6 @@ static int rte_init(void) error = "orte_filem_base_select"; goto error; } - #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC @@ -718,7 +700,6 @@ static int rte_init(void) #else opal_cr_set_enabled(false); #endif - /* * Initalize the CR setup * Note: Always do this, even in non-FT builds. @@ -729,7 +710,6 @@ static int rte_init(void) error = "orte_cr_init"; goto error; } - /* setup the dfs framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -741,7 +721,6 @@ static int rte_init(void) error = "orte_dfs_select"; goto error; } - /* setup the schizo framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_schizo_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -753,7 +732,6 @@ static int rte_init(void) error = "orte_schizo_select"; goto error; } - /* if a tool has launched us and is requesting event reports, * then set its contact info into the comm system */ @@ -763,14 +741,12 @@ static int rte_init(void) goto error; } } - /* We actually do *not* want an HNP to voluntarily yield() the processor more than necessary. Orterun already blocks when it is doing nothing, so it doesn't use any more CPU cycles than it should; but when it *is* doing something, we do not want it to be unnecessarily delayed because it voluntarily yielded the processor in the middle of its work. 
- For example: when a message arrives at orterun, we want the OS to wake us up in a timely fashion (which most OS's seem good about doing) and then we want orterun to process @@ -783,7 +759,6 @@ static int rte_init(void) problematic in some scenarios (e.g., COMM_SPAWN, BTL's that require OOB messages for wireup, etc.). */ opal_progress_set_yield_when_idle(false); - return ORTE_SUCCESS; error: @@ -792,7 +767,6 @@ static int rte_init(void) "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } - return ORTE_ERR_SILENT; } @@ -855,7 +829,7 @@ static int rte_finalize(void) /* ensure we scrub the session directory tree */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - + /* close the xml output file, if open */ if (orte_xml_output) { fprintf(orte_xml_fp, "\n"); @@ -864,7 +838,6 @@ static int rte_finalize(void) fclose(orte_xml_fp); } } - return ORTE_SUCCESS; } @@ -872,24 +845,21 @@ static void rte_abort(int status, bool report) { /* do NOT do a normal finalize as this will very likely * hang the process. We are aborting due to an abnormal condition - * that precludes normal cleanup + * that precludes normal cleanup * - * We do need to do the following bits to make sure we leave a + * We do need to do the following bits to make sure we leave a * clean environment. Taken from orte_finalize(): * - Assume errmgr cleans up child processes before we exit. 
*/ - + /* CRS cleanup since it may have a named pipe and thread active */ orte_cr_finalize(); - /* ensure we scrub the session directory tree */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - - /* - Clean out the global structures + /* - Clean out the global structures * (not really necessary, but good practice) */ orte_proc_info_finalize(); - /* just exit */ exit(status); } @@ -903,13 +873,10 @@ static void clean_abort(int fd, short flags, void *arg) if (forcibly_die) { /* kill any local procs */ orte_odls.kill_local_procs(NULL); - /* whack any lingering session directory files from our jobs */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); - /* cleanup our data server */ orte_data_server_finalize(); - /* exit with a non-zero status */ exit(ORTE_ERROR_DEFAULT_EXIT_CODE); } @@ -924,17 +891,14 @@ static void clean_abort(int fd, short flags, void *arg) /* ensure that the forwarding of stdin stops */ orte_job_term_ordered = true; - /* tell us to be quiet - hey, the user killed us with a ctrl-c, * so need to tell them that! */ orte_execute_quiet = true; - if (!orte_never_launched) { /* cleanup our data server */ orte_data_server_finalize(); } - /* We are in an event handler; the job completed procedure will delete the signal handler that is currently running (which is a Bad Thing), so we can't call it directly. diff --git a/orte/mca/oob/oob.h b/orte/mca/oob/oob.h index cc3778e892d..862860e48e5 100644 --- a/orte/mca/oob/oob.h +++ b/orte/mca/oob/oob.h @@ -44,7 +44,7 @@ #include "orte/mca/mca.h" #include "orte/mca/rml/base/base.h" - +#include "orte/mca/qos/base/base.h" BEGIN_C_DECLS typedef bool (*mca_oob_base_component_avail_fn_t)(void); diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index b2233d715a8..a8109787559 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -9,11 +9,11 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. 
- * Copyright (c) 2006-2013 Los Alamos National Security, LLC. + * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -238,9 +238,9 @@ static int parse_uri(const uint16_t af_family, #if OPAL_ENABLE_IPV6 else if (AF_INET6 == af_family) { struct sockaddr_in6 *in6; - memset(inaddr, 0, sizeof(struct sockaddr_in6)); + memset(inaddr, 0, sizeof(struct sockaddr_in6)); in6 = (struct sockaddr_in6*) inaddr; - + if (0 == inet_pton(AF_INET6, host, (void*)&in6->sin6_addr)) { opal_output (0, "oob_tcp_parse_uri: Could not convert %s\n", host); return ORTE_ERR_BAD_PARAM; @@ -250,8 +250,6 @@ static int parse_uri(const uint16_t af_family, else { return ORTE_ERR_NOT_SUPPORTED; } - - return ORTE_SUCCESS; } @@ -274,7 +272,7 @@ static void process_set_peer(int fd, short args, void *cbdata) if (AF_INET != pop->af_family) { opal_output_verbose(20, orte_oob_base_framework.framework_output, - "%s NOT AF_INET", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + "%s NOT AF_INET", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); goto cleanup; } @@ -401,10 +399,10 @@ static void process_send(int fd, short args, void *cbdata) orte_process_name_t hop; opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s:[%s:%d] processing send to peer %s:%d", + "%s:[%s:%d] processing send to peer %s:%d to channel =%d seq_num = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, - ORTE_NAME_PRINT(&op->msg->dst), op->msg->tag); + ORTE_NAME_PRINT(&op->msg->dst), op->msg->tag, op->msg->dst_channel, op->msg->seq_num); /* do we have a route to this peer (could be direct)?
*/ hop = orte_routed.get_route(&op->msg->dst); @@ -546,7 +544,7 @@ static void resend(struct mca_oob_tcp_msg_error_t *mp) * socket to recv. This is called for the listen sockets to accept an * incoming connection, on new sockets trying to complete the software * connection process, and for probes. Data on an established - * connection is handled elsewhere. + * connection is handled elsewhere. */ static void recv_handler(int sd, short flg, void *cbdata) { @@ -583,7 +581,6 @@ static void recv_handler(int sd, short flg, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strerror(opal_socket_errno), opal_socket_errno); } } - /* is the peer instance willing to accept this connection */ peer->sd = sd; if (mca_oob_tcp_peer_accept(peer) == false) { diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index 4bf063465e9..9716063c8d7 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -14,7 +14,7 @@ * reserved. * Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ * @@ -25,7 +25,7 @@ * In windows, many of the socket functions return an EWOULDBLOCK * instead of things like EAGAIN, EINPROGRESS, etc. 
It has been * verified that this will not conflict with other error codes that - * are returned by these functions under UNIX/Linux environments + * are returned by these functions under UNIX/Linux environments */ #include "orte_config.h" @@ -150,7 +150,7 @@ static int tcp_component_open(void) #endif /* if_include and if_exclude need to be mutually exclusive */ - if (OPAL_SUCCESS != + if (OPAL_SUCCESS != mca_base_var_check_exclusive("orte", mca_oob_tcp_component.super.oob_base.mca_type_name, mca_oob_tcp_component.super.oob_base.mca_component_name, @@ -162,7 +162,6 @@ static int tcp_component_open(void) "open" failing is not printed */ return ORTE_ERR_NOT_AVAILABLE; } - return ORTE_SUCCESS; } @@ -275,7 +274,7 @@ static int tcp_component_register(void) #if ORTE_ENABLE_STATIC_PORTS static_port_string = NULL; - (void)mca_base_component_var_register(component, "static_ipv4_ports", + (void)mca_base_component_var_register(component, "static_ipv4_ports", "Static ports for daemons and procs (IPv4)", MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9, @@ -295,7 +294,7 @@ static int tcp_component_register(void) #if OPAL_ENABLE_IPV6 static_port_string6 = NULL; - (void)mca_base_component_var_register(component, "static_ipv6_ports", + (void)mca_base_component_var_register(component, "static_ipv6_ports", "Static ports for daemons and procs (IPv6)", MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9, @@ -320,7 +319,6 @@ static int tcp_component_register(void) } #endif #endif - dyn_port_string = NULL; (void)mca_base_component_var_register(component, "dynamic_ipv4_ports", "Range of ports to be dynamically used by daemons and procs (IPv4)", @@ -406,7 +404,7 @@ static int tcp_component_register(void) &mca_oob_tcp_component.disable_ipv6_family); #endif - + mca_oob_tcp_component.keepalive_time = 10; (void)mca_base_component_var_register(component, "keepalive_time", "Idle time in seconds before starting to send keepalives (num <= 0 ----> disable keepalive)", @@ -487,7 +485,7 @@ 
static bool component_available(void) excluding = true; } - /* look at all available interfaces */ + /* look at all available interfaces */ for (i = opal_ifbegin(); i >= 0; i = opal_ifnext(i)) { if (OPAL_SUCCESS != opal_ifindextoaddr(i, (struct sockaddr*) &my_ss, sizeof (my_ss))) { @@ -495,7 +493,6 @@ static bool component_available(void) i, opal_ifindextokindex(i)); continue; } - /* ignore non-ip4/6 interfaces */ if (AF_INET != my_ss.ss_family #if OPAL_ENABLE_IPV6 @@ -504,7 +501,6 @@ static bool component_available(void) ) { continue; } - kindex = opal_ifindextokindex(i); if (kindex <= 0) { continue; @@ -562,7 +558,7 @@ static bool component_available(void) * IP interfaces that are "up" on the same subnet (because that's a Bad Idea). Note * that we should only check for this after applying the relevant include/exclude * list MCA params. If we detect redundant ports, we can also automatically ignore - * them so that applications won't hang. + * them so that applications won't hang. */ /* add this address to our connections */ @@ -638,7 +634,7 @@ static int component_startup(void) static void component_shutdown(void) { - int i=0; + int i = 0; opal_list_item_t *item; opal_output_verbose(2, orte_oob_base_framework.framework_output, @@ -665,9 +661,9 @@ static void component_shutdown(void) static int component_send(orte_rml_send_t *msg) { opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:tcp:send_nb to peer %s:%d", + "%s oob:tcp:send_nb to peer %s:%d to channel=%d seq = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&msg->dst), msg->tag); + ORTE_NAME_PRINT(&msg->dst), msg->tag,msg->dst_channel, msg->seq_num ); /* the module is potentially running on its own event * base, so all it can do is push our send request @@ -748,8 +744,8 @@ static int component_set_addr(orte_process_name_t *peer, tcpuri = strdup(uris[i]); if (NULL == tcpuri) { opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s oob:tcp: out of memory", - 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + "%s oob:tcp: out of memory", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); continue; } if (0 == strncmp(uris[i], "tcp:", 4)) { @@ -781,7 +777,6 @@ static int component_set_addr(orte_process_name_t *peer, "%s oob:tcp: working peer %s address %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer), uris[i]); - /* separate the ports from the network addrs */ ports = strrchr(tcpuri, ':'); *ports = '\0'; @@ -830,7 +825,7 @@ static int component_set_addr(orte_process_name_t *peer, } else { host = addrs[j]; } - + /* pass this proc, and its ports, to the * module for handling - this module will be responsible * for communicating with the proc via this network. @@ -1060,6 +1055,8 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata) snd->dst = mop->snd->hdr.dst; snd->origin = mop->snd->hdr.origin; snd->tag = mop->snd->hdr.tag; + snd->dst_channel = mop->snd->hdr.channel; + snd->seq_num = mop->snd->hdr.seq_num; snd->data = mop->snd->data; snd->count = mop->snd->hdr.nbytes; snd->cbfunc.iov = NULL; @@ -1137,7 +1134,7 @@ static char **split_and_resolve(char **orig_str, char *name) str = strchr(argv[i], '/'); if (NULL == str) { orte_show_help("help-oob-tcp.txt", "invalid if_inexclude", - true, name, orte_process_info.nodename, + true, name, orte_process_info.nodename, tmp, "Invalid specification (missing \"/\")"); free(argv[i]); free(tmp); @@ -1148,7 +1145,7 @@ static char **split_and_resolve(char **orig_str, char *name) /* Now convert the IPv4 address */ ((struct sockaddr*) &argv_inaddr)->sa_family = AF_INET; - ret = inet_pton(AF_INET, argv[i], + ret = inet_pton(AF_INET, argv[i], &((struct sockaddr_in*) &argv_inaddr)->sin_addr); free(argv[i]); @@ -1165,11 +1162,11 @@ static char **split_and_resolve(char **orig_str, char *name) name, opal_net_get_hostname((struct sockaddr*) &argv_inaddr), argv_prefix); - + /* Go through all interfaces and see if we can find a match */ for (if_index = opal_ifbegin(); if_index >= 0; - 
if_index = opal_ifnext(if_index)) { - opal_ifindextoaddr(if_index, + if_index = opal_ifnext(if_index)) { + opal_ifindextoaddr(if_index, (struct sockaddr*) &if_inaddr, sizeof(if_inaddr)); if (opal_net_samenetwork((struct sockaddr*) &argv_inaddr, @@ -1178,7 +1175,6 @@ static char **split_and_resolve(char **orig_str, char *name) break; } } - /* If we didn't find a match, keep trying */ if (if_index < 0) { orte_show_help("help-oob-tcp.txt", "invalid if_inexclude", diff --git a/orte/mca/oob/tcp/oob_tcp_hdr.h b/orte/mca/oob/tcp/oob_tcp_hdr.h index 1bd4ec66db0..057ec2cb686 100644 --- a/orte/mca/oob/tcp/oob_tcp_hdr.h +++ b/orte/mca/oob/tcp/oob_tcp_hdr.h @@ -5,17 +5,19 @@ * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. + * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014 -2015 Intel, Inc. All rights reserved. 
+ * * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -54,6 +56,10 @@ typedef struct { mca_oob_tcp_msg_type_t type; /* the rml tag where this message is headed */ orte_rml_tag_t tag; + /* the rml channel where this message is headed */ + orte_rml_channel_num_t channel; + /* the seq number of this message */ + uint32_t seq_num; /* number of bytes in message */ uint32_t nbytes; } mca_oob_tcp_hdr_t; diff --git a/orte/mca/oob/tcp/oob_tcp_sendrecv.c b/orte/mca/oob/tcp/oob_tcp_sendrecv.c index 7f77287ad15..f75827a7f37 100644 --- a/orte/mca/oob/tcp/oob_tcp_sendrecv.c +++ b/orte/mca/oob/tcp/oob_tcp_sendrecv.c @@ -5,25 +5,25 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. + * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ * * In windows, many of the socket functions return an EWOULDBLOCK * instead of \ things like EAGAIN, EINPROGRESS, etc. 
It has been * verified that this will \ not conflict with other error codes that - * are returned by these functions \ under UNIX/Linux environments + * are returned by these functions \ under UNIX/Linux environments */ #include "orte_config.h" @@ -102,9 +102,9 @@ static int send_bytes(mca_oob_tcp_peer_t* peer) return ORTE_ERR_WOULD_BLOCK; } /* we hit an error and cannot progress this message */ - opal_output(0, "%s->%s mca_oob_tcp_msg_send_bytes: write failed: %s (%d) [sd = %d]", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), + opal_output(0, "%s->%s mca_oob_tcp_msg_send_bytes: write failed: %s (%d) [sd = %d]", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&(peer->name)), strerror(opal_socket_errno), opal_socket_errno, peer->sd); @@ -196,7 +196,12 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata) ORTE_NAME_PRINT(&(peer->name))); opal_event_del(&peer->send_event); msg->msg->status = rc; - ORTE_RML_SEND_COMPLETE(msg->msg); + if( NULL == msg->msg->channel) { + ORTE_RML_SEND_COMPLETE(msg->msg); + } + else { + ORTE_QOS_SEND_COMPLETE(msg->msg); + } OBJ_RELEASE(msg); peer->send_msg = NULL; goto next; @@ -223,7 +228,12 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata) ORTE_NAME_PRINT(&(peer->name)), (int)ntohl(msg->hdr.nbytes), peer->sd); msg->msg->status = ORTE_SUCCESS; - ORTE_RML_SEND_COMPLETE(msg->msg); + if( NULL == msg->msg->channel) { + ORTE_RML_SEND_COMPLETE(msg->msg); + } + else { + ORTE_QOS_SEND_COMPLETE(msg->msg); + } OBJ_RELEASE(msg); peer->send_msg = NULL; } else if (NULL != msg->msg->data) { @@ -258,7 +268,12 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata) ORTE_NAME_PRINT(&(peer->name)), (int)ntohl(msg->hdr.nbytes), peer->sd); msg->msg->status = ORTE_SUCCESS; - ORTE_RML_SEND_COMPLETE(msg->msg); + if( NULL == msg->msg->channel) { + ORTE_RML_SEND_COMPLETE(msg->msg); + } + else { + ORTE_QOS_SEND_COMPLETE(msg->msg); + } OBJ_RELEASE(msg); peer->send_msg = NULL; } @@ -275,7 
+290,12 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata) ORTE_NAME_PRINT(&(peer->name)), peer->sd); opal_event_del(&peer->send_event); msg->msg->status = rc; - ORTE_RML_SEND_COMPLETE(msg->msg); + if( NULL == msg->msg->channel) { + ORTE_RML_SEND_COMPLETE(msg->msg); + } + else { + ORTE_QOS_SEND_COMPLETE(msg->msg); + } OBJ_RELEASE(msg); peer->send_msg = NULL; ORTE_FORCED_TERMINATE(1); @@ -293,7 +313,7 @@ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata) peer->send_msg = (mca_oob_tcp_send_t*) opal_list_remove_first(&peer->send_queue); } - + /* if nothing else to do unregister for send event notifications */ if (NULL == peer->send_msg && peer->send_ev_active) { opal_event_del(&peer->send_event); @@ -344,7 +364,7 @@ static int read_bytes(mca_oob_tcp_peer_t* peer) * to abort this message */ opal_output_verbose(OOB_TCP_DEBUG_FAIL, orte_oob_base_framework.framework_output, - "%s-%s mca_oob_tcp_msg_recv: readv failed: %s (%d)", + "%s-%s mca_oob_tcp_msg_recv: readv failed: %s (%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), strerror(opal_socket_errno), @@ -359,7 +379,7 @@ static int read_bytes(mca_oob_tcp_peer_t* peer) * and let the caller know */ opal_output_verbose(OOB_TCP_DEBUG_FAIL, orte_oob_base_framework.framework_output, - "%s-%s mca_oob_tcp_msg_recv: peer closed connection", + "%s-%s mca_oob_tcp_msg_recv: peer closed connection", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name))); /* stop all events */ @@ -554,9 +574,12 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata) peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) { /* yes - post it to the RML for delivery */ opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s DELIVERING TO RML", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + "%s DELIVERING TO RML tag = %d channel = %d seq_num = %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + peer->recv_msg->hdr.tag, peer->recv_msg->hdr.channel, + 
peer->recv_msg->hdr.seq_num); ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin, peer->recv_msg->hdr.tag, + peer->recv_msg->hdr.channel, peer->recv_msg->hdr.seq_num, peer->recv_msg->data, peer->recv_msg->hdr.nbytes); OBJ_RELEASE(peer->recv_msg); @@ -572,6 +595,8 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata) snd->origin = peer->recv_msg->hdr.origin; snd->tag = peer->recv_msg->hdr.tag; snd->data = peer->recv_msg->data; + snd->dst_channel = peer->recv_msg->hdr.channel; + snd->seq_num = peer->recv_msg->hdr.seq_num; snd->count = peer->recv_msg->hdr.nbytes; snd->cbfunc.iov = NULL; snd->cbdata = NULL; @@ -600,8 +625,8 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata) } } break; - default: - opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: invalid socket state(%d)", + default: + opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: invalid socket state(%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), peer->state); diff --git a/orte/mca/oob/tcp/oob_tcp_sendrecv.h b/orte/mca/oob/tcp/oob_tcp_sendrecv.h index e1d27e19031..d8ac555b966 100644 --- a/orte/mca/oob/tcp/oob_tcp_sendrecv.h +++ b/orte/mca/oob/tcp/oob_tcp_sendrecv.h @@ -5,18 +5,18 @@ * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. + * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -107,16 +107,19 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t); mca_oob_tcp_send_t *msg; \ int i; \ opal_output_verbose(5, orte_oob_base_framework.framework_output, \ - "%s:[%s:%d] queue send to %s", \ - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ - __FILE__, __LINE__, \ - ORTE_NAME_PRINT(&((m)->dst))); \ + "%s:[%s:%d] queue send to %s channel =%d", \ + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ + __FILE__, __LINE__, \ + ORTE_NAME_PRINT(&((m)->dst)), \ + (m)->dst_channel); \ msg = OBJ_NEW(mca_oob_tcp_send_t); \ /* setup the header */ \ msg->hdr.origin = (m)->origin; \ msg->hdr.dst = (m)->dst; \ msg->hdr.type = MCA_OOB_TCP_USER; \ msg->hdr.tag = (m)->tag; \ + msg->hdr.channel = (m)->dst_channel; \ + msg->hdr.seq_num = (m)->seq_num; \ /* point to the actual message */ \ msg->msg = (m); \ /* set the total number of bytes to be sent */ \ @@ -160,6 +163,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t); msg->hdr.dst = (m)->dst; \ msg->hdr.type = MCA_OOB_TCP_USER; \ msg->hdr.tag = (m)->tag; \ + msg->hdr.channel = (m)->dst_channel; \ + msg->hdr.seq_num = (m)->seq_num; \ /* point to the actual message */ \ msg->msg = (m); \ /* set the total number of bytes to be sent */ \ diff --git a/orte/mca/oob/ud/oob_ud_recv.c b/orte/mca/oob/ud/oob_ud_recv.c index 76084883110..8dc7e4d5001 100644 --- a/orte/mca/oob/ud/oob_ud_recv.c +++ b/orte/mca/oob/ud/oob_ud_recv.c @@ -4,6 +4,7 @@ * reserved. * 2014 Mellanox Technologies, Inc. * All rights reserved. + * Copyright (c) 2015 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -80,7 +81,8 @@ int mca_oob_ud_get_recv_req (const orte_process_name_t name, const int tag, req->req_origin = name; req->req_tag = tag; - + req->req_channel = ORTE_RML_INVALID_CHANNEL_NUM; + req->req_seq_num = 0; /* this receive was not expected */ req->type = MCA_OOB_UD_REQ_RECV; diff --git a/orte/mca/oob/ud/oob_ud_req.c b/orte/mca/oob/ud/oob_ud_req.c index 9c510240735..3018fc75ba0 100644 --- a/orte/mca/oob/ud/oob_ud_req.c +++ b/orte/mca/oob/ud/oob_ud_req.c @@ -4,6 +4,7 @@ * reserved. * 2014 Mellanox Technologies, Inc. * All rights reserved. + * Copyright (c) 2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -286,7 +287,11 @@ void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc) case MCA_OOB_UD_REQ_SEND: if (req->req_data_type != MCA_OOB_UD_REQ_TR) { req->rml_msg->status = rc; - ORTE_RML_SEND_COMPLETE(req->rml_msg); + if( NULL == req->rml_msg->channel) { + ORTE_RML_SEND_COMPLETE(req->rml_msg); + } else { + ORTE_QOS_SEND_COMPLETE(req->rml_msg); + } } break; case MCA_OOB_UD_REQ_RECV: @@ -302,11 +307,11 @@ void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc) memcpy (&data[datalen], req->req_data.iov.uiov[i].iov_base, req->req_data.iov.uiov[i].iov_len); datalen += req->req_data.iov.uiov[i].iov_len; } - ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, data, datalen); + ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_channel, req->req_seq_num, data, datalen); free(data); } else { - ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, - req->req_data.buf.p, req->req_data.buf.size); + ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_channel, req->req_seq_num, + req->req_data.buf.p, req->req_data.buf.size); } } else { opal_output_verbose(1, orte_oob_base_framework.framework_output, @@ -318,7 +323,8 @@ void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc) snd->dst = req->req_target; snd->origin = req->req_origin; snd->tag = 
req->req_tag; - + snd->dst_channel = req->req_channel; + snd->seq_num = req->req_seq_num; if (MCA_OOB_UD_REQ_IOV == req->req_data_type) { char *data = (char *)calloc(req->req_data.iov.count, sizeof(struct iovec)); int datalen = 0; diff --git a/orte/mca/oob/ud/oob_ud_req.h b/orte/mca/oob/ud/oob_ud_req.h index b718ed758ee..8fb8bd26afb 100644 --- a/orte/mca/oob/ud/oob_ud_req.h +++ b/orte/mca/oob/ud/oob_ud_req.h @@ -4,6 +4,7 @@ * reserved. * 2014 Mellanox Technologies, Inc. * All rights reserved. + * Copyright (c) 2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -154,6 +155,8 @@ struct mca_oob_ud_req_t { }req_data; int req_tag; + int req_channel; + int req_seq_num; int req_rc; void *req_cbdata; diff --git a/orte/mca/oob/ud/oob_ud_send.c b/orte/mca/oob/ud/oob_ud_send.c index 748fcfe36cd..fc3ae3db824 100644 --- a/orte/mca/oob/ud/oob_ud_send.c +++ b/orte/mca/oob/ud/oob_ud_send.c @@ -4,6 +4,7 @@ * reserved. * 2014 Mellanox Technologies, Inc. * All rights reserved. + * Copyright (c) 2015 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -107,7 +108,11 @@ static int mca_oob_ud_send_self (orte_rml_send_t *msg) req->rml_msg->status = ORTE_SUCCESS; - ORTE_RML_SEND_COMPLETE(req->rml_msg); + if( NULL == req->rml_msg->channel) { + ORTE_RML_SEND_COMPLETE(req->rml_msg); + } else { + ORTE_QOS_SEND_COMPLETE(req->rml_msg); + } return size; } @@ -165,6 +170,8 @@ int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata) send_req->req_target = op->msg->dst; send_req->req_origin = op->msg->origin; send_req->req_tag = op->msg->tag; + send_req->req_channel = op->msg->dst_channel; + send_req->req_seq_num = op->msg->seq_num; if (op->msg->data != NULL) { size = op->msg->count; diff --git a/orte/mca/oob/usock/oob_usock_component.c b/orte/mca/oob/usock/oob_usock_component.c index 67e0437d866..bf9b3128934 100644 --- a/orte/mca/oob/usock/oob_usock_component.c +++ b/orte/mca/oob/usock/oob_usock_component.c @@ -24,7 +24,7 @@ * In windows, many of the socket functions return an EWOULDBLOCK * instead of things like EAGAIN, EINPROGRESS, etc. 
It has been * verified that this will not conflict with other error codes that - * are returned by these functions under UNIX/Linux environments + * are returned by these functions under UNIX/Linux environments */ #include "orte_config.h" @@ -251,9 +251,9 @@ static int component_send(orte_rml_send_t *msg) orte_proc_t *proc; opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:usock:send_nb to peer %s:%d", + "%s oob:usock:send_nb to peer %s:%d to channel=%d seq_num =%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&msg->dst), msg->tag); + ORTE_NAME_PRINT(&msg->dst), msg->tag, msg->dst_channel, msg->seq_num); if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { /* daemons can only reach local procs */ diff --git a/orte/mca/oob/usock/oob_usock_connection.c b/orte/mca/oob/usock/oob_usock_connection.c index ab9ba573cfa..3f247c9bd25 100644 --- a/orte/mca/oob/usock/oob_usock_connection.c +++ b/orte/mca/oob/usock/oob_usock_connection.c @@ -5,21 +5,21 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. + * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -94,7 +94,6 @@ static int usock_peer_create_socket(mca_oob_usock_peer_t* peer) "%s oob:usock:peer creating socket to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)))); - peer->sd = socket(PF_UNIX, SOCK_STREAM, 0); if (peer->sd < 0) { @@ -120,7 +119,7 @@ static int usock_peer_create_socket(mca_oob_usock_peer_t* peer) /* setup the socket as non-blocking */ if (peer->sd >= 0) { if ((flags = fcntl(peer->sd, F_GETFL, 0)) < 0) { - opal_output(0, "%s-%s usock_peer_connect: fcntl(F_GETFL) failed: %s (%d)\n", + opal_output(0, "%s-%s usock_peer_connect: fcntl(F_GETFL) failed: %s (%d)\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), strerror(opal_socket_errno), @@ -128,7 +127,7 @@ static int usock_peer_create_socket(mca_oob_usock_peer_t* peer) } else { flags |= O_NONBLOCK; if(fcntl(peer->sd, F_SETFL, flags) < 0) - opal_output(0, "%s-%s usock_peer_connect: fcntl(F_SETFL) failed: %s (%d)\n", + opal_output(0, "%s-%s usock_peer_connect: fcntl(F_SETFL) failed: %s (%d)\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), strerror(opal_socket_errno), @@ -248,7 +247,6 @@ void mca_oob_usock_peer_try_connect(int fd, short args, void *cbdata) "Connection across to proc %s succeeded", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name)); - /* setup our recv to catch the return ack call */ if (!peer->recv_ev_active) { opal_event_add(&peer->recv_event, 0); @@ -259,7 +257,7 @@ void mca_oob_usock_peer_try_connect(int fd, short args, void *cbdata) if (ORTE_SUCCESS == (rc = usock_peer_send_connect_ack(peer))) { peer->state = MCA_OOB_USOCK_CONNECT_ACK; } else { - opal_output(0, + opal_output(0, "%s orte_usock_peer_try_connect: " "usock_peer_send_connect_ack to proc %s failed: %s (%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -279,7 +277,7 @@ static int usock_peer_send_connect_ack(mca_oob_usock_peer_t* peer) size_t sdsize; char *cred; 
size_t credsize; - + opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s SEND CONNECT ACK", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); @@ -290,6 +288,8 @@ static int usock_peer_send_connect_ack(mca_oob_usock_peer_t* peer) hdr.dst = peer->name; hdr.type = MCA_OOB_USOCK_IDENT; hdr.tag = 0; + hdr.channel = 0xffffffff; + hdr.seq_num = 0; /* get our security credential*/ if (OPAL_SUCCESS != (rc = opal_sec.get_my_credential(peer->auth_method, @@ -314,7 +314,7 @@ static int usock_peer_send_connect_ack(mca_oob_usock_peer_t* peer) memcpy(msg+sizeof(hdr), orte_version_string, strlen(orte_version_string)); memcpy(msg+sizeof(hdr)+strlen(orte_version_string)+1, cred, credsize); free(cred); - + if (ORTE_SUCCESS != usock_peer_send_blocking(peer, peer->sd, msg, sdsize)) { ORTE_ERROR_LOG(ORTE_ERR_UNREACH); free(msg); @@ -341,7 +341,6 @@ static void usock_peer_event_init(mca_oob_usock_peer_t* peer) opal_event_del(&peer->recv_event); peer->recv_ev_active = false; } - opal_event_set(mca_oob_usock_module.ev_base, &peer->send_event, peer->sd, @@ -373,7 +372,7 @@ void mca_oob_usock_peer_complete_connect(mca_oob_usock_peer_t *peer) /* check connect completion status */ if (getsockopt(peer->sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) { - opal_output(0, "%s usock_peer_complete_connect: getsockopt() to %s failed: %s (%d)\n", + opal_output(0, "%s usock_peer_complete_connect: getsockopt() to %s failed: %s (%d)\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), strerror(opal_socket_errno), @@ -423,7 +422,7 @@ void mca_oob_usock_peer_complete_connect(mca_oob_usock_peer_t *peer) "setting read event on connection to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name))); - + if (!peer->recv_ev_active) { opal_event_add(&peer->recv_event, 0); peer->recv_ev_active = true; @@ -803,8 +802,8 @@ static bool usock_peer_recv_blocking(mca_oob_usock_peer_t* peer, /* socket is non-blocking so handle errors */ if (retval 
< 0) { - if (opal_socket_errno != EINTR && - opal_socket_errno != EAGAIN && + if (opal_socket_errno != EINTR && + opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) { if (peer->state == MCA_OOB_USOCK_CONNECT_ACK) { /* If we overflow the listen backlog, it's @@ -828,7 +827,7 @@ static bool usock_peer_recv_blocking(mca_oob_usock_peer_t* peer, (NULL == peer) ? "UNKNOWN" : ORTE_NAME_PRINT(&(peer->name))); return false; } else { - opal_output(0, + opal_output(0, "%s usock_peer_recv_blocking: " "recv() failed for %s: %s (%d)\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -869,11 +868,10 @@ void mca_oob_usock_peer_dump(mca_oob_usock_peer_t* peer, const char* msg) strerror(opal_socket_errno), opal_socket_errno); } - #if defined(USOCK_NODELAY) optlen = sizeof(nodelay); if (getsockopt(peer->sd, IPPROTO_USOCK, USOCK_NODELAY, (char *)&nodelay, &optlen) < 0) { - opal_output(0, "usock_peer_dump: USOCK_NODELAY option: %s (%d)\n", + opal_output(0, "usock_peer_dump: USOCK_NODELAY option: %s (%d)\n", strerror(opal_socket_errno), opal_socket_errno); } diff --git a/orte/mca/oob/usock/oob_usock_hdr.h b/orte/mca/oob/usock/oob_usock_hdr.h index 3ee83967733..c7cad2d998b 100644 --- a/orte/mca/oob/usock/oob_usock_hdr.h +++ b/orte/mca/oob/usock/oob_usock_hdr.h @@ -5,18 +5,18 @@ * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. + * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. 
+ * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -48,6 +48,10 @@ typedef struct { mca_oob_usock_msg_type_t type; /* the rml tag where this message is headed */ orte_rml_tag_t tag; + /* the rml channel to which this message is headed */ + orte_rml_channel_num_t channel; + /* msg seq number on the src channel */ + uint32_t seq_num; /* number of bytes in message */ uint32_t nbytes; } mca_oob_usock_hdr_t; diff --git a/orte/mca/oob/usock/oob_usock_sendrecv.c b/orte/mca/oob/usock/oob_usock_sendrecv.c index 11817eeeff7..b07e42956a3 100644 --- a/orte/mca/oob/usock/oob_usock_sendrecv.c +++ b/orte/mca/oob/usock/oob_usock_sendrecv.c @@ -5,25 +5,25 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. + * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ * * In windows, many of the socket functions return an EWOULDBLOCK * instead of \ things like EAGAIN, EINPROGRESS, etc. 
It has been * verified that this will \ not conflict with other error codes that - * are returned by these functions \ under UNIX/Linux environments + * are returned by these functions \ under UNIX/Linux environments */ #include "orte_config.h" @@ -97,9 +97,9 @@ static int send_bytes(mca_oob_usock_peer_t* peer) return ORTE_ERR_WOULD_BLOCK; } /* we hit an error and cannot progress this message */ - opal_output(0, "%s->%s mca_oob_usock_msg_send_bytes: write failed: %s (%d) [sd = %d]", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), + opal_output(0, "%s->%s mca_oob_usock_msg_send_bytes: write failed: %s (%d) [sd = %d]", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&(peer->name)), strerror(opal_socket_errno), opal_socket_errno, peer->sd); @@ -187,7 +187,12 @@ void mca_oob_usock_send_handler(int sd, short flags, void *cbdata) opal_event_del(&peer->send_event); peer->send_ev_active = false; msg->msg->status = rc; - ORTE_RML_SEND_COMPLETE(msg->msg); + if( NULL == msg->msg->channel) { + ORTE_RML_SEND_COMPLETE(msg->msg); + } + else { + ORTE_QOS_SEND_COMPLETE(msg->msg); + } OBJ_RELEASE(msg); peer->send_msg = NULL; goto next; @@ -205,7 +210,12 @@ void mca_oob_usock_send_handler(int sd, short flags, void *cbdata) ORTE_NAME_PRINT(&(peer->name)), msg->hdr.nbytes, peer->sd); msg->msg->status = ORTE_SUCCESS; - ORTE_RML_SEND_COMPLETE(msg->msg); + if( NULL == msg->msg->channel) { + ORTE_RML_SEND_COMPLETE(msg->msg); + } + else { + ORTE_QOS_SEND_COMPLETE(msg->msg); + } OBJ_RELEASE(msg); peer->send_msg = NULL; } else if (NULL != msg->msg->data) { @@ -236,7 +246,12 @@ void mca_oob_usock_send_handler(int sd, short flags, void *cbdata) ORTE_NAME_PRINT(&(peer->name)), msg->hdr.nbytes, peer->sd); msg->msg->status = ORTE_SUCCESS; - ORTE_RML_SEND_COMPLETE(msg->msg); + if( NULL == msg->msg->channel) { + ORTE_RML_SEND_COMPLETE(msg->msg); + } + else { + ORTE_QOS_SEND_COMPLETE(msg->msg); + } OBJ_RELEASE(msg); peer->send_msg = NULL; } @@ -254,7 +269,12 @@ void 
mca_oob_usock_send_handler(int sd, short flags, void *cbdata) opal_event_del(&peer->send_event); peer->send_ev_active = false; msg->msg->status = rc; - ORTE_RML_SEND_COMPLETE(msg->msg); + if( NULL == msg->msg->channel) { + ORTE_RML_SEND_COMPLETE(msg->msg); + } + else { + ORTE_QOS_SEND_COMPLETE(msg->msg); + } OBJ_RELEASE(msg); peer->send_msg = NULL; ORTE_FORCED_TERMINATE(1); @@ -272,7 +292,6 @@ void mca_oob_usock_send_handler(int sd, short flags, void *cbdata) peer->send_msg = (mca_oob_usock_send_t*) opal_list_remove_first(&peer->send_queue); } - /* if nothing else to do unregister for send event notifications */ if (NULL == peer->send_msg && peer->send_ev_active) { opal_event_del(&peer->send_event); @@ -320,7 +339,7 @@ static int read_bytes(mca_oob_usock_peer_t* peer) * to abort this message */ opal_output_verbose(OOB_USOCK_DEBUG_FAIL, orte_oob_base_framework.framework_output, - "%s-%s mca_oob_usock_msg_recv: readv failed: %s (%d)", + "%s-%s mca_oob_usock_msg_recv: readv failed: %s (%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), strerror(opal_socket_errno), @@ -335,7 +354,7 @@ static int read_bytes(mca_oob_usock_peer_t* peer) * and let the caller know */ opal_output_verbose(OOB_USOCK_DEBUG_FAIL, orte_oob_base_framework.framework_output, - "%s-%s mca_oob_usock_msg_recv: peer closed connection", + "%s-%s mca_oob_usock_msg_recv: peer closed connection", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name))); /* stop all events */ @@ -507,9 +526,10 @@ void mca_oob_usock_recv_handler(int sd, short flags, void *cbdata) peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) { /* yes - post it to the RML for delivery */ opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, - "%s DELIVERING TO RML", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + "%s DELIVERING TO RML", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin, peer->recv_msg->hdr.tag, + 
peer->recv_msg->hdr.channel, peer->recv_msg->hdr.seq_num, peer->recv_msg->data, peer->recv_msg->hdr.nbytes); OBJ_RELEASE(peer->recv_msg); @@ -525,6 +545,8 @@ void mca_oob_usock_recv_handler(int sd, short flags, void *cbdata) snd->origin = peer->recv_msg->hdr.origin; snd->tag = peer->recv_msg->hdr.tag; snd->data = peer->recv_msg->data; + snd->dst_channel = peer->recv_msg->hdr.channel; + snd->seq_num = peer->recv_msg->hdr.seq_num; snd->count = peer->recv_msg->hdr.nbytes; snd->cbfunc.iov = NULL; snd->cbdata = NULL; @@ -553,8 +575,8 @@ void mca_oob_usock_recv_handler(int sd, short flags, void *cbdata) } } break; - default: - opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: invalid socket state(%d)", + default: + opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: invalid socket state(%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), peer->state); diff --git a/orte/mca/oob/usock/oob_usock_sendrecv.h b/orte/mca/oob/usock/oob_usock_sendrecv.h index c704c4f89f9..65658da08c7 100644 --- a/orte/mca/oob/usock/oob_usock_sendrecv.h +++ b/orte/mca/oob/usock/oob_usock_sendrecv.h @@ -5,18 +5,18 @@ * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2013 Los Alamos National Security, LLC. + * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -123,6 +123,8 @@ OBJ_CLASS_DECLARATION(mca_oob_usock_recv_t); msg->hdr.dst = (m)->dst; \ msg->hdr.type = MCA_OOB_USOCK_USER; \ msg->hdr.tag = (m)->tag; \ + msg->hdr.channel = (m)->dst_channel; \ + msg->hdr.seq_num = (m)->seq_num; \ /* point to the actual message */ \ msg->msg = (m); \ /* set the total number of bytes to be sent */ \ @@ -164,6 +166,8 @@ OBJ_CLASS_DECLARATION(mca_oob_usock_recv_t); msg->hdr.dst = (m)->dst; \ msg->hdr.type = MCA_OOB_USOCK_USER; \ msg->hdr.tag = (m)->tag; \ + msg->hdr.channel = (m)->dst_channel; \ + msg->hdr.seq_num = (m)->seq_num; \ /* point to the actual message */ \ msg->msg = (m); \ /* set the total number of bytes to be sent */ \ diff --git a/orte/mca/qos/Makefile.am b/orte/mca/qos/Makefile.am new file mode 100644 index 00000000000..b1e55afb922 --- /dev/null +++ b/orte/mca/qos/Makefile.am @@ -0,0 +1,31 @@ +# +# Copyright (c) 2014 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# main library setup +noinst_LTLIBRARIES = libmca_qos.la +libmca_qos_la_SOURCES = + +# pkgdata setup +dist_ortedata_DATA = + +# local files +headers = qos.h +libmca_qos_la_SOURCES += $(headers) + +# Conditionally install the header files +if WANT_INSTALL_HEADERS +ortedir = $(orteincludedir)/$(subdir) +nobase_orte_HEADERS = $(headers) +endif + +include base/Makefile.am + + +distclean-local: + rm -f base/static-components.h diff --git a/orte/mca/qos/ack/Makefile.am b/orte/mca/qos/ack/Makefile.am new file mode 100644 index 00000000000..6cc61b5598e --- /dev/null +++ b/orte/mca/qos/ack/Makefile.am @@ -0,0 +1,34 @@ +# +# Copyright (c) 2015 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + qos_ack.h \ + qos_ack_component.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_orte_qos_ack_DSO +component_noinst = +component_install = mca_qos_ack.la +else +component_noinst = libmca_qos_ack.la +component_install = +endif + +mcacomponentdir = $(ortelibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_qos_ack_la_SOURCES = $(sources) +mca_qos_ack_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_qos_ack_la_SOURCES = $(sources) +libmca_qos_ack_la_LDFLAGS = -module -avoid-version + diff --git a/orte/mca/qos/ack/qos_ack.h b/orte/mca/qos/ack/qos_ack.h new file mode 100644 index 00000000000..0b0e8ef276a --- /dev/null +++ b/orte/mca/qos/ack/qos_ack.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * QoS Ack Component interface + * + * + * + */ + +#ifndef MCA_QOS_ACK_H +#define MCA_QOS_ACK_H + +#include "orte_config.h" +#include "orte/mca/qos/qos.h" +#include "orte/mca/qos/base/base.h" +#include "opal/class/opal_hotel.h" + +BEGIN_C_DECLS + +#define QOS_ACK_SEQ_NUM_UNINITIALIZED 0 +#define QOS_ACK_MAX_WINDOW 100 +#define QOS_ACK_MAX_OUTSTANDING_MSGS (QOS_ACK_MAX_WINDOW *2) +/* window timeout in secs - 100 seconds ok? 
+ TO DO: make this a QOS attribute that can be specified by the user */ +#define QOS_ACK_WINDOW_TIMEOUT_IN_SECS 1 +#define ACK_WINDOW_COMPLETE 0 +#define ACK_TIMEOUT 1 +#define ACK_OUT_OF_ORDER 2 +#define ACK_RECV_MISSED_MSG 3 /* received previously missed msgs*/ + +typedef enum { + orte_qos_ack_channel_state_inactive = 0, + orte_qos_ack_channel_state_filling_window = 1, + orte_qos_ack_channel_state_window_completed = 2, + orte_qos_ack_channel_state_awaiting_ack = 3, + orte_qos_ack_channel_state_received_ack = 4, +}orte_qos_ack_channel_state_t ; + +/* Ack Qos channel data structure */ +typedef struct orte_qos_ack_channel { + opal_list_item_t super; + uint32_t channel_num; + // we retain the attributes so we can compare channels - we can get rid of this and compare incoming attributes + // with attributes of interest to this channel type + opal_list_t attributes; + /* size of the message window */ + uint32_t window; + /* window timeout in secs.*/ + uint32_t timeout_secs; + /* retry msg window on ack fail */ + bool retry; + /* seq number of the first msg in the active window */ + uint32_t window_first_seq_num; + /* sequence number of last outgoing msg */ + uint32_t out_msg_seq_num; + /* sequence number of last incoming msg */ + uint32_t in_msg_seq_num; + /* sequence number of the last message acked */ + uint32_t ack_msg_seq_num; + /* ACK outstanding msgs hotel */ + opal_hotel_t outstanding_msgs; + /* array for mapping msg seq num to room num for outgoing msgs in hotels */ + int seq_num_to_room_num[QOS_ACK_MAX_OUTSTANDING_MSGS]; + /* channel state */ + orte_qos_ack_channel_state_t state; + /* window timer event */ + opal_event_t msg_ack_timer_event; +}orte_qos_ack_channel_t; + +OBJ_CLASS_DECLARATION(orte_qos_ack_channel_t); + +extern orte_qos_module_t orte_qos_ack_module; +static inline int orte_qos_ack_channel_get_msg_room (orte_qos_ack_channel_t * ack_chan, + uint32_t seq_num) +{ + return ack_chan->seq_num_to_room_num[(seq_num % QOS_ACK_MAX_OUTSTANDING_MSGS)]; +} + 
+static inline void orte_qos_ack_channel_set_msg_room (orte_qos_ack_channel_t * ack_chan, + uint32_t seq_num, int room_num) +{ + ack_chan->seq_num_to_room_num[(seq_num % QOS_ACK_MAX_OUTSTANDING_MSGS)] = room_num; +} + +ORTE_DECLSPEC void orte_qos_ack_msg_ack_timeout_callback (struct opal_hotel_t *hotel, + int room_num, void *occupant); +ORTE_DECLSPEC void orte_qos_ack_msg_window_timeout_callback (int fd, short flags, void *cbdata); +ORTE_DECLSPEC void orte_qos_ack_recv_msg_timeout_callback (struct opal_hotel_t *hotel, + int room_num, void *occupant); +END_C_DECLS + +#endif /* MCA_QOS_ACK_H */ diff --git a/orte/mca/qos/ack/qos_ack_component.c b/orte/mca/qos/ack/qos_ack_component.c new file mode 100644 index 00000000000..d8a3aa224f8 --- /dev/null +++ b/orte/mca/qos/ack/qos_ack_component.c @@ -0,0 +1,710 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + + +#include "opal/mca/mca.h" +#include "opal/util/output.h" +#include "opal/mca/base/base.h" + +#include "orte/mca/oob/base/base.h" +#include "orte/mca/qos/base/base.h" +#include "orte/mca/qos/qos.h" +#include "qos_ack.h" + +/* ack module functions */ +static int qos_ack_start (void); +static void qos_ack_shutdown (void); +static void* ack_create (opal_list_t *qos_attributes, uint32_t channel_num); +static int ack_open (void *qos_channel, + opal_buffer_t * buf); +static int ack_send ( void *qos_channel, orte_rml_send_t *msg); +static int ack_recv (void *channel, orte_rml_recv_t *msg); +static int ack_close (void * channel); +static int ack_init_recv (void *channel, opal_list_t *attributes); +static int ack_cmp (void *channel, opal_list_t *attributes); +static void ack_send_callback (orte_rml_send_t *msg); + +/* utility functions */ +static inline int send_ack (orte_qos_ack_channel_t * channel, + 
orte_rml_channel_num_t channel_num,
+                            uint32_t ack_type,
+                            uint32_t last_msg_seq_num);
+
+void orte_qos_ack_channel_process_ack (int status, orte_process_name_t* sender,
+                                       opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata);
+
+void orte_qos_ack_msg_send_callback ( int status,
+                                      orte_process_name_t *peer,
+                                      struct opal_buffer_t* buffer,
+                                      orte_rml_tag_t tag,
+                                      void* cbdata);
+static inline int process_out_of_order_msg ( orte_qos_ack_channel_t *channel,
+                                             orte_rml_recv_t *msg);
+/**
+ * ack module definition
+ */
+orte_qos_module_t orte_qos_ack_module = {
+    ack_create,
+    ack_open,
+    ack_send,
+    ack_recv,
+    ack_close,
+    ack_init_recv,
+    ack_cmp,
+    ack_send_callback
+};
+
+/**
+ * component definition
+ */
+mca_qos_base_component_t mca_qos_ack_component = {
+    /* First, the mca_base_component_t struct containing meta
+       information about the component itself */
+
+    {
+        MCA_QOS_BASE_VERSION_2_0_0,
+
+        "ack", /* MCA component name */
+        ORTE_MAJOR_VERSION, /* MCA component major version */
+        ORTE_MINOR_VERSION, /* MCA component minor version */
+        ORTE_RELEASE_VERSION, /* MCA component release version */
+        NULL,
+        NULL,
+    },
+    qos_ack_start,
+    qos_ack_shutdown,
+    orte_qos_ack,
+    {
+        ack_create,
+        ack_open,
+        ack_send,
+        ack_recv,
+        ack_close,
+        ack_init_recv,
+        ack_cmp,
+        ack_send_callback
+    }
+};
+
+static int qos_ack_start(void) {
+    /* post a persistent receive for ACK TAG */
+    orte_rml.recv_buffer_nb (ORTE_NAME_WILDCARD, ORTE_RML_TAG_MSG_ACK,
+                             ORTE_RML_PERSISTENT, orte_qos_ack_channel_process_ack,
+                             NULL);
+    return ORTE_SUCCESS;
+}
+
+static void qos_ack_shutdown (void) {
+}
+
+/**
+ * Create an ACK QoS channel from the caller's attribute list.
+ *
+ * ORTE_QOS_WINDOW_SIZE (at most QOS_ACK_MAX_WINDOW) and
+ * ORTE_QOS_ACK_NACK_TIMEOUT are required; ORTE_QOS_MSG_RETRY is optional.
+ * Each accepted value is copied into the channel and mirrored, as a global
+ * attribute, into the channel's own attribute list.
+ *
+ * @param qos_attributes  attributes requested for the channel
+ * @param channel_num     channel number stored in the new channel
+ * @return the new channel, or NULL when a required attribute is missing or
+ *         out of range, or when an attribute cannot be stored
+ */
+static void* ack_create (opal_list_t *qos_attributes, uint32_t channel_num) {
+    orte_qos_ack_channel_t *ack_chan;
+    int32_t rc;
+    /* stored as OPAL_UINT8, so hand the attribute code a real uint8_t rather
+       than the first byte of a uint32_t (wrong byte on big-endian hosts) */
+    uint8_t type_val = (uint8_t) orte_qos_ack;
+    uint32_t attribute_val = 0;
+    uint32_t *attribute = &attribute_val;
+
+    ack_chan = OBJ_NEW (orte_qos_ack_channel_t);
+    if (NULL == ack_chan) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        return NULL;
+    }
+    ack_chan->channel_num = channel_num;
+
+    /* validate and store ack specific channel attributes */
+    /* set channel type */
+    if (ORTE_SUCCESS != (rc = orte_set_attribute(&ack_chan->attributes, ORTE_QOS_TYPE,
+                                                  ORTE_ATTR_GLOBAL, (void*)&type_val, OPAL_UINT8))) {
+        ORTE_ERROR_LOG(rc);
+        goto fail;
+    }
+
+    /* window size: required (a missing value was never logged; kept quiet) */
+    if (!orte_get_attribute (qos_attributes, ORTE_QOS_WINDOW_SIZE, (void**)&attribute, OPAL_UINT32)) {
+        goto fail;
+    }
+    if (QOS_ACK_MAX_WINDOW < (*attribute)) {
+        ORTE_ERROR_LOG(OPAL_ERR_VALUE_OUT_OF_BOUNDS);
+        goto fail;
+    }
+    ack_chan->window = *attribute;
+    if (ORTE_SUCCESS != (rc = orte_set_attribute(&ack_chan->attributes, ORTE_QOS_WINDOW_SIZE,
+                                                  ORTE_ATTR_GLOBAL, (void*)attribute, OPAL_UINT32))) {
+        ORTE_ERROR_LOG(rc);
+        goto fail;
+    }
+
+    /* ack timeout: required. rc still holds ORTE_SUCCESS here, so report a
+       real error code instead of logging "success" */
+    if (!orte_get_attribute (qos_attributes, ORTE_QOS_ACK_NACK_TIMEOUT, (void**)&attribute, OPAL_UINT32)) {
+        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+        goto fail;
+    }
+    ack_chan->timeout_secs = *attribute;
+    if (ORTE_SUCCESS != (rc = orte_set_attribute(&ack_chan->attributes, ORTE_QOS_ACK_NACK_TIMEOUT,
+                                                  ORTE_ATTR_GLOBAL, (void*)attribute, OPAL_UINT32))) {
+        ORTE_ERROR_LOG(rc);
+        goto fail;
+    }
+
+    /* retry: optional flag, true when the attribute is present */
+    ack_chan->retry = orte_get_attribute (qos_attributes, ORTE_QOS_MSG_RETRY, NULL, OPAL_BOOL);
+    if (ack_chan->retry) {
+        if (ORTE_SUCCESS != (rc = orte_set_attribute(&ack_chan->attributes, ORTE_QOS_MSG_RETRY,
+                                                      ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL))) {
+            ORTE_ERROR_LOG(rc);
+            goto fail;
+        }
+    }
+
+    /* report only once every field (including retry) holds its final value */
+    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                         "%s ack_create created channel = %p window = %d timeout =%d retry = %d",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         (void*)ack_chan,
+                         ack_chan->window,
+                         ack_chan->timeout_secs,
+                         ack_chan->retry));
+    return ack_chan;
+
+fail:
+    OBJ_RELEASE(ack_chan);
+    return NULL;
+}
+
+static int ack_open (void *qos_channel, opal_buffer_t * buf) {
+    int32_t rc = ORTE_SUCCESS;
+    uint32_t eviction_timeout;
+    orte_qos_ack_channel_t *ack_chan;
+    ack_chan = (orte_qos_ack_channel_t*) (qos_channel);
+    /* TO DO - need to adjust eviction timeout according to window size
+       lets keep max time out for the first pass */
+    /* NOTE(review): the multiplier is 100000, not 1000000; if opal_hotel_init
+       expects microseconds this is a tenth of the intended time - confirm */
+    eviction_timeout = (ack_chan->timeout_secs + QOS_ACK_WINDOW_TIMEOUT_IN_SECS) * 100000;
+    /* init outstanding msg hotel */
+    opal_hotel_init (&ack_chan->outstanding_msgs, QOS_ACK_MAX_OUTSTANDING_MSGS,
+                     eviction_timeout, 0,
+                     orte_qos_ack_msg_ack_timeout_callback);
+    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                         "%s ack_open channel = %p init hotel timeout =%d",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         (void*)ack_chan, eviction_timeout));
+    /* set the message window timer event, but don't activate it */
+    /*opal_event_set(opal_event_base,
+                   &ack_chan->msg_window_timer_event,
+                   -1, 0, orte_qos_ack_msg_window_timeout_callback,
+                   ack_chan);
+    opal_event_set_priority(&ack_chan->msg_window_timer_event, ORTE_MSG_PRI);*/
+    /* the Qos module puts the non local attributes to be sent to the peer in a list at the time of create.
+ pack those attributes into the buffer.*/ + if (ORTE_SUCCESS != (rc = orte_qos_base_pack_attributes(buf, &ack_chan->attributes))) + ORTE_ERROR_LOG(rc); + return rc; +} + +static int ack_send ( void *qos_channel, orte_rml_send_t *msg) { + int32_t room_num; + orte_qos_ack_channel_t *ack_chan = (orte_qos_ack_channel_t*) (qos_channel); + if (ack_chan->out_msg_seq_num == ack_chan->window_first_seq_num -1 ) { + /* begining msg window */ + ack_chan->out_msg_seq_num = ack_chan->window_first_seq_num; + OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, + "%s ack_send msg = %p to peer = %s\n begining window at seq_num = %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (void*)msg, ORTE_NAME_PRINT(&msg->dst), ack_chan->out_msg_seq_num)); + ack_chan->state = orte_qos_ack_channel_state_filling_window; + } + else + ack_chan->out_msg_seq_num++; + if(ack_chan->out_msg_seq_num - ack_chan->window_first_seq_num == ack_chan->window - 1) { + /* we are at the end of the window. */ + /* update state */ + ack_chan->state = orte_qos_ack_channel_state_window_completed; + /* set begin window for next sequence */ + ack_chan->window_first_seq_num = ack_chan->out_msg_seq_num + 1; + } + msg->seq_num = ack_chan->out_msg_seq_num; + /* check msg into hotel */ + if( OPAL_SUCCESS == (opal_hotel_checkin(&ack_chan->outstanding_msgs, msg, &room_num ))) { + /* store room number */ + orte_qos_ack_channel_set_msg_room(ack_chan, msg->seq_num, room_num); + } else { + OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, + "%s ack_send msg = %p to peer = %s returned with error %d\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (void*)msg, ORTE_NAME_PRINT(&msg->dst), + ORTE_ERR_QOS_ACK_WINDOW_FULL)); + return ORTE_ERR_QOS_ACK_WINDOW_FULL; + } + OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, + "%s ack_send msg = %p to peer = %s\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (void*)msg, ORTE_NAME_PRINT(&msg->dst))); + return ORTE_SUCCESS; +} + +static inline int send_ack 
(orte_qos_ack_channel_t * ack_chan, + orte_rml_channel_num_t channel_num, + uint32_t ack_type, uint32_t last_msg_seq_num) +{ + int rc; + orte_rml_channel_t *rml_channel; + opal_buffer_t *buffer; + uint32_t num_msgs_to_ack = 0; + uint32_t *ack_seq_num_array; + uint32_t i; + rml_channel = orte_rml_base_get_channel (channel_num); + num_msgs_to_ack = ack_chan->in_msg_seq_num - ack_chan->ack_msg_seq_num + 1; + OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, + "%s sending ack type = %d \n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ack_type)); + if ( NULL != (ack_seq_num_array = malloc (sizeof(uint32_t) * num_msgs_to_ack))) { + for (i = 1; i <= num_msgs_to_ack ; i++) { + ack_seq_num_array[i-1] = ack_chan->ack_msg_seq_num + i; + OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, + "%s ack_recv acking msg %d to peer = %s\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ack_seq_num_array[i-1], + ORTE_NAME_PRINT(&rml_channel->peer))); + } + ack_seq_num_array[num_msgs_to_ack - 1] = last_msg_seq_num; + OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, + "%s ack_recv acking last msg %d to peer = %s\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ack_seq_num_array[num_msgs_to_ack - 1], + ORTE_NAME_PRINT(&rml_channel->peer))); + } + else { + OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, + "%s ack_recv cannot allocate ack array to send ack to peer = %s\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&rml_channel->peer))); + rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE; + return rc; + } + buffer = OBJ_NEW (opal_buffer_t); + /* pack channel number */ + opal_dss.pack (buffer, &rml_channel->peer_channel, 1, OPAL_UINT32); + /* pack ack type */ + opal_dss.pack (buffer, &ack_type, 1, OPAL_UINT32); + /* pack num messages */ + opal_dss.pack (buffer, &num_msgs_to_ack, 1, OPAL_UINT32); + /* pack seq number array */ + for (i =0; ipeer, buffer, ORTE_RML_TAG_MSG_ACK, + orte_qos_ack_msg_send_callback, rml_channel); + if(ORTE_SUCCESS == 
rc) {
+        /* update last acked msg */
+        ack_chan->ack_msg_seq_num = last_msg_seq_num;
+    } else {
+        //TO DO
+    }
+    return rc;
+}
+
+/**
+ * Handle a received message that is not the next expected one.
+ *
+ * A message whose sequence slot already maps to a room is treated as a
+ * duplicate. Otherwise the message is parked in the hotel and, depending on
+ * the ack/in sequence counters, either an ACK_OUT_OF_ORDER is sent or - once
+ * every previously lost message has arrived - all parked messages are
+ * completed in sequence order and an ACK_RECV_MISSED_MSG is sent.
+ *
+ * @param ack_chan  receiving channel
+ * @param msg       message just received
+ * @return ORTE_ERR_DUPLICATE_MSG, ORTE_ERR_OUT_OF_ORDER_MSG, or
+ *         ORTE_ERR_TEMP_OUT_OF_RESOURCE when the message cannot be parked
+ */
+static inline int process_out_of_order_msg ( orte_qos_ack_channel_t *ack_chan,
+                                             orte_rml_recv_t *msg)
+{
+    int32_t room_num, first_lost_msg_seq_num, num_lost_msgs, i;
+    orte_rml_recv_t *out_msg;
+    void *occupant = NULL;
+    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                         "%s process_out_of_order_msg msg %d",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         msg->seq_num));
+    /* if this msg is a duplicate - then do nothing */
+    if ((orte_qos_ack_channel_get_msg_room(ack_chan, msg->seq_num)) != -1) {
+        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                             "%s process_out_of_order_msg msg %d",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             msg->seq_num));
+        return ORTE_ERR_DUPLICATE_MSG;
+    }
+
+    /* park the message; only record a room number the hotel actually
+       handed out (room_num is not known to be written on failure) */
+    if (OPAL_SUCCESS != opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)msg, &room_num)) {
+        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                             "%s process_out_of_order_msg no room in hotel for msg %d",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             msg->seq_num));
+        return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
+    }
+    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                         "process_out_of_order_msg checked in msg %d in room %d\n",
+                         msg->seq_num, room_num));
+    orte_qos_ack_channel_set_msg_room (ack_chan, msg->seq_num, room_num);
+
+    /* check if we need to send an ACK */
+    if (ack_chan->ack_msg_seq_num <= ack_chan->in_msg_seq_num) {
+        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                             "%s process_out_of_order_msg sending ack last seq_num = %d\n",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             msg->seq_num));
+        /* send ACK. */
+        send_ack (ack_chan, msg->channel_num, ACK_OUT_OF_ORDER, msg->seq_num);
+        /* stop window ack timer */
+        opal_event_evtimer_del (&ack_chan->msg_ack_timer_event);
+        return ORTE_ERR_OUT_OF_ORDER_MSG;
+    }
+
+    /* if we got a lost msg - any seq num between in_msg_seq_num and ack_seq_num */
+    if (ack_chan->ack_msg_seq_num <= msg->seq_num) {
+        return ORTE_ERR_OUT_OF_ORDER_MSG;
+    }
+
+    /* check if we have got all lost msgs */
+    first_lost_msg_seq_num = ack_chan->in_msg_seq_num + 1;
+    num_lost_msgs = ack_chan->ack_msg_seq_num - ack_chan->in_msg_seq_num;
+    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                         "%s process_out_of_order_msg msg %d first_lost_msg =%d num_lost_msgs =%d",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         msg->seq_num, first_lost_msg_seq_num, num_lost_msgs));
+    for (i = 0; i < num_lost_msgs; i++) {
+        if ((orte_qos_ack_channel_get_msg_room(ack_chan, first_lost_msg_seq_num + i)) == -1) {
+            /* still waiting for at least one lost msg */
+            return ORTE_ERR_OUT_OF_ORDER_MSG;
+        }
+    }
+
+    /* we got all the lost msgs so we can complete all the msgs in the hotel now */
+    /* reset ack_seq_num */
+    ack_chan->ack_msg_seq_num = first_lost_msg_seq_num - 1;
+    for (i = 0; ; i++) {
+        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                             "%s process_out_of_order_msg got all lost msgs completing outstanding msgs %d",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             (first_lost_msg_seq_num + i)));
+        room_num = orte_qos_ack_channel_get_msg_room (ack_chan, first_lost_msg_seq_num + i);
+        if (-1 == room_num) {
+            /* first sequence number with no parked msg ends the run. Stop
+               before the checkout: -1 passes its upper-bound assert and
+               would index rooms[-1] */
+            OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                                 "%s process_out_of_order_msg lost msg %d not in hotel",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 (first_lost_msg_seq_num + i)));
+            break;
+        }
+        /* evict msg and complete it; clear occupant first because checkout
+           leaves it untouched when the room is empty */
+        occupant = NULL;
+        opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
+        orte_qos_ack_channel_set_msg_room(ack_chan, first_lost_msg_seq_num + i, -1);
+        out_msg = (orte_rml_recv_t *) occupant;
+        if (NULL != out_msg) {
+            /* set in seq num */
+            ack_chan->in_msg_seq_num = out_msg->seq_num;
+            orte_rml_base_complete_recv_msg(&out_msg);
+            /* completing recv msg to rml */
+            OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                                 "process_out_of_order_msg completed recv msg %d",
+                                 (first_lost_msg_seq_num + i)));
+        } else {
+            OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                                 "%s process_out_of_order_msg lost msg %d not in hotel",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 (first_lost_msg_seq_num + i)));
+        }
+    } //end for
+    /* send ACK */
+    send_ack (ack_chan, ack_chan->channel_num, ACK_RECV_MISSED_MSG,
+              ack_chan->in_msg_seq_num);
+    return ORTE_ERR_OUT_OF_ORDER_MSG;
+}
+
+static int ack_recv (void *qos_channel, orte_rml_recv_t *msg) {
+    orte_qos_ack_channel_t *ack_chan;
+    ack_chan = (orte_qos_ack_channel_t*) (qos_channel);
+    int32_t rc;
+    struct timeval ack_timeout;
+    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                         "%s ack_recv msg = %p seq_num = %d from peer = %s\n",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         (void*)msg, msg->seq_num,
+                         ORTE_NAME_PRINT(&msg->sender)));
+    /** HACK - drop every third msg to stimulate lost msg */
+    /* if ((msg->seq_num == 3) && (hack == 0)) {
+        OBJ_RELEASE(msg);
+        hack = 1;
+        return ORTE_ERROR;
+    }*/
+    /* check if this is the next expected msg*/
+    if((ack_chan->in_msg_seq_num + 1 == msg->seq_num) && (ack_chan->ack_msg_seq_num < msg->seq_num))
+    {
+        /* check if we are at the end of the window */
+        if(ack_chan->window == (msg->seq_num - ack_chan->ack_msg_seq_num)) {
+            /* stop window ack timer */
+            opal_event_evtimer_del (&ack_chan->msg_ack_timer_event);
+            rc = send_ack (ack_chan, msg->channel_num, ACK_WINDOW_COMPLETE, msg->seq_num);
+        } else {
+            if(ack_chan->in_msg_seq_num == ack_chan->ack_msg_seq_num) {
+                /* begining window -start window ack timer */
+                ack_timeout.tv_sec = ack_chan->timeout_secs;
+                ack_timeout.tv_usec = 0;
+                opal_event_evtimer_add (&ack_chan->msg_ack_timer_event, &ack_timeout);
+            }
+            rc = ORTE_SUCCESS;
+        }
+        ack_chan->in_msg_seq_num = msg->seq_num;
+    }
+    else {
+        rc = process_out_of_order_msg(ack_chan, msg);
+    }
+    return rc;
+}
+
+static int ack_close (void *
channel) { + int32_t rc = ORTE_SUCCESS; + orte_qos_ack_channel_t *ack_chan; + ack_chan = (orte_qos_ack_channel_t*) (channel); + /* check if channel is busy (no outstanding msgs */ + if (opal_hotel_is_empty (&ack_chan->outstanding_msgs)) { + /* no outstanding msgs, release channel */ + OBJ_RELEASE(ack_chan); + rc = ORTE_SUCCESS; + } else + rc = ORTE_ERR_CHANNEL_BUSY; + return rc; +} + +static int ack_init_recv (void *channel, opal_list_t *attributes) { + int32_t rc = ORTE_SUCCESS; + uint32_t eviction_timeout; + orte_qos_ack_channel_t *ack_chan; + ack_chan = (orte_qos_ack_channel_t*) channel; + /* TO DO - need to adjust eviction timeout according to window size + lets keep max time out for the first pass */ + eviction_timeout = (ack_chan->timeout_secs + QOS_ACK_WINDOW_TIMEOUT_IN_SECS) * 100000; + /* init outstanding msg hotel */ + opal_hotel_init (&ack_chan->outstanding_msgs, QOS_ACK_MAX_OUTSTANDING_MSGS, + eviction_timeout, 0, + orte_qos_ack_recv_msg_timeout_callback); + OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, + "%s ack_open channel = %p init hotel timeout =%d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (void*)ack_chan, eviction_timeout)); + opal_event_evtimer_set (orte_event_base, &ack_chan->msg_ack_timer_event, + orte_qos_ack_msg_window_timeout_callback, (void *) ack_chan); + return rc; +} + +static int ack_cmp (void *channel, opal_list_t *attributes) { + return false; + +} + +static void ack_send_callback (orte_rml_send_t *msg) +{ + orte_qos_ack_channel_t *ack_chan; + /* complete the request back to the user only upon receiving the ack + nothing to do here, just make sure that the request is in the hotel */ + OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, + "%s ack_send_callback for msg = %p seq num =%d\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (void*)msg, msg->seq_num)); + ack_chan = (orte_qos_ack_channel_t *) msg->channel->qos_channel_ptr; + /* if msg->status != SUCCESS - then evict all messages in the window and + 
complete them?? */ + if(ORTE_SUCCESS == msg->status) { + // nothing to do + assert((orte_qos_ack_channel_get_msg_room(ack_chan, msg->seq_num)) != -1); + } else { + // TO DO : error handling + OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, + "%s ack_send_callback for msg = %p seq num =%d SEND FAILED status = %d\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (void*)msg, msg->seq_num, msg->status)); + /* evict message from hotel and send end of window to receiver?? */ + + } +} + +void orte_qos_ack_msg_ack_timeout_callback (struct opal_hotel_t *hotel, + int room_num, void *occupant) +{ + orte_rml_send_t *msg; + orte_qos_ack_channel_t *ack_chan; + msg = (orte_rml_send_t *) occupant; + ack_chan = (orte_qos_ack_channel_t*) msg->channel->qos_channel_ptr; + OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, + "%s orte_qos_ack_msg_ack_timeout_callback for msg = %p seq num =%d\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (void*)msg, msg->seq_num)); + /* for now complete only the msg that timed out + TO DO : handle the completion of all messages in the window */ + msg->status = ORTE_ERR_ACK_TIMEOUT_SENDER; + // set room num to -1 for the msg's seq number + orte_qos_ack_channel_set_msg_room (ack_chan, msg->seq_num , -1); + // complete the msg + ORTE_RML_SEND_COMPLETE(msg); +} + +void orte_qos_ack_recv_msg_timeout_callback (struct opal_hotel_t *hotel, + int room_num, void *occupant) +{ + orte_rml_recv_t *msg = (orte_rml_recv_t *) occupant; +#if 0 + orte_qos_ack_channel_t *ack_chan; + orte_rml_channel_t *channel; + + channel = orte_rml_base_get_channel(msg->channel_num); + ack_chan = (orte_qos_ack_channel_t*) channel->qos_channel_ptr; +#endif + + OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, + "%s OOPS received msg = %p seq num =%d timed out on ACK Queue\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (void*)msg, msg->seq_num)); + /* Need to determine correct action here as the sender hasn't responded yet to + a lost msg event */ + /* 
This is highly unlikely - lets assert to enable debug*/
+    assert(0);
+    /*
+    // set room num to -1 for the msg's seq number
+    ack_chan->seq_num_to_room_num[msg->seq_num % QOS_ACK_MAX_OUTSTANDING_MSGS] = -1;
+    // complete the msg
+    ORTE_RML_REACTIVATE_MESSAGE(msg);*/
+}
+
+/**
+ * Receive callback for ORTE_RML_TAG_MSG_ACK: process an ACK on the sender.
+ *
+ * The buffer holds, in order: channel number, ack type, number of acked
+ * messages, then that many sequence numbers (all OPAL_UINT32).
+ *
+ * Each acked message still in the hotel is checked out and completed with
+ * ORTE_SUCCESS. For ACK_OUT_OF_ORDER, a gap before an entry means the
+ * messages in the gap were lost: the entry is checked back in, and every
+ * message in the gap is marked ORTE_ERR_LOST_MSG_IN_WINDOW, re-checked in
+ * and resent.
+ *
+ * @param status  not used
+ * @param sender  not used
+ * @param buffer  packed ACK message
+ * @param tag     not used
+ * @param cbdata  not used
+ */
+void orte_qos_ack_channel_process_ack (int status, orte_process_name_t* sender,
+                                       opal_buffer_t *buffer,
+                                       orte_rml_tag_t tag, void *cbdata)
+{
+    uint32_t num_msgs_acked = 0, channel_num = 0, ack_type = 0, i;
+    uint32_t missed_msg_seq_num;
+    uint32_t *seq_num_array;
+    int32_t num_values, room_num, rc;
+    orte_rml_send_t *msg, *missed_msg;
+    void *occupant;
+    orte_rml_channel_t *channel;
+    orte_qos_ack_channel_t *ack_chan;
+
+    /* unpack channel number first */
+    num_values = 1;
+    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, (void*) &channel_num, &num_values, OPAL_UINT32))) {
+        ORTE_ERROR_LOG(rc);
+        return;
+    }
+    OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
+                         "orte_qos_ack_channel_process_ack recieved ack on channel = %d",
+                         channel_num));
+    channel = orte_rml_base_get_channel (channel_num);
+    /* both must be valid; the original "||" dereferenced a NULL channel */
+    if ((NULL == channel) || (NULL == channel->qos_channel_ptr)) {
+        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
+                             "orte_qos_ack_channel_msg_ack_recv_callback recieved ack on non existent channel = %d",
+                             channel_num));
+        return;
+    }
+    ack_chan = (orte_qos_ack_channel_t *) (channel->qos_channel_ptr);
+
+    /* unpack ack type */
+    num_values = 1;
+    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, (void*) &ack_type, &num_values, OPAL_UINT32))) {
+        ORTE_ERROR_LOG(rc);
+        return;
+    }
+    /* unpack num messages acked */
+    num_values = 1;
+    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, (void*) &num_msgs_acked, &num_values, OPAL_UINT32))) {
+        ORTE_ERROR_LOG(rc);
+        return;
+    }
+    OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
+                         "orte_qos_ack_channel_process_ack recieved ack type %d for %d msgs on channel = %d",
+                         ack_type, num_msgs_acked, channel_num));
+    if (0 == num_msgs_acked) {
+        return;
+    }
+
+    /* size the array by the count actually being read (it used to be sized
+       by the window while indexed by the wire-supplied count) */
+    seq_num_array = calloc (num_msgs_acked, sizeof(*seq_num_array));
+    if (NULL == seq_num_array) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        return;
+    }
+
+    for (i = 0; i < num_msgs_acked; i++) {
+        num_values = 1;
+        if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32))) {
+            /* fewer sequence numbers than announced */
+            ORTE_ERROR_LOG(rc);
+            break;
+        }
+        /* clear occupant every pass: checkout leaves it untouched when the
+           room is empty, and it must not be called with room -1 at all */
+        occupant = NULL;
+        room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]);
+        if (-1 != room_num) {
+            opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
+            orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1);
+        }
+        if (NULL == occupant) {
+            OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
+                                 "OOPS received an ACK for already completed seq_num =%d ",
+                                 seq_num_array[i] ));
+            continue;
+        }
+
+        /* normal ACK, or an out of order ACK entry that directly follows
+           the previous one: the message was delivered */
+        if ((ACK_OUT_OF_ORDER != ack_type) || (0 == i) ||
+            (seq_num_array[i] == seq_num_array[i-1] + 1)) {
+            msg = (orte_rml_send_t*) occupant;
+            OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
+                                 "Releasing sent message with tag %d and seq_num %d after receiving Ack from dest ",
+                                 msg->tag, msg->seq_num ));
+            msg->status = ORTE_SUCCESS;
+            ORTE_RML_SEND_COMPLETE(msg);
+            continue;
+        }
+
+        /* out of order ACK with a gap before entry i (i > 0 here): the
+           receiver holds msg i but is missing everything in between */
+        // num_missed_msgs = (seq_num_array[i] - seq_num_array [i-1] - 1);
+        assert( i == num_msgs_acked -1);
+        /* recheck the ith msg */
+        if (OPAL_SUCCESS == (rc = opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)occupant, &room_num))) {
+            orte_qos_ack_channel_set_msg_room (ack_chan, seq_num_array[i], room_num);
+        } else {
+            ORTE_ERROR_LOG(rc);
+        }
+        /* resend and recheck all the missed msgs*/
+        for (missed_msg_seq_num = seq_num_array[i-1] + 1;
+             missed_msg_seq_num < seq_num_array[i];
+             missed_msg_seq_num++) {
+            occupant = NULL;
+            room_num = orte_qos_ack_channel_get_msg_room (ack_chan, missed_msg_seq_num);
+            if (-1 != room_num) {
+                opal_hotel_checkout_and_return_occupant (&ack_chan->outstanding_msgs, room_num, &occupant);
+            }
+            if (NULL == occupant) {
+                /* was assert(NULL != occupant): with asserts compiled out
+                   that became a NULL dereference */
+                OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
+                                     "orte_qos_ack_channel_process_ack missed msg seq_num =%d not in hotel, cannot resend ",
+                                     missed_msg_seq_num ));
+                continue;
+            }
+            missed_msg = (orte_rml_send_t*) occupant;
+            missed_msg->status = ORTE_ERR_LOST_MSG_IN_WINDOW;
+            if (OPAL_SUCCESS == (rc = opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)missed_msg, &room_num))) {
+                orte_qos_ack_channel_set_msg_room (ack_chan, missed_msg_seq_num, room_num);
+            } else {
+                ORTE_ERROR_LOG(rc);
+                orte_qos_ack_channel_set_msg_room (ack_chan, missed_msg_seq_num, -1);
+            }
+            /* send this out on wire directly */
+            ORTE_OOB_SEND (missed_msg);
+        } //end for
+    } // end for
+    free(seq_num_array);
+}
+
+
+void orte_qos_ack_msg_send_callback ( int status,
+                                      orte_process_name_t *peer,
+                                      struct opal_buffer_t* buffer,
+                                      orte_rml_tag_t tag,
+                                      void* cbdata)
+{
+    orte_rml_channel_t *channel = (orte_rml_channel_t*) cbdata;
+    OPAL_OUTPUT_VERBOSE ((0, orte_qos_base_framework.framework_output,
+                          " orte_qos_ack_msg_send_callback channel num =%d status =%d",
+                          channel->channel_num, status));
+}
+
+void orte_qos_ack_msg_window_timeout_callback (int fd, short flags, void *cbdata)
+{
+    // int32_t rc;
+    orte_qos_ack_channel_t *ack_chan = (orte_qos_ack_channel_t*) cbdata;
+    OPAL_OUTPUT_VERBOSE ((0, orte_qos_base_framework.framework_output,
+                          " orte_qos_ack_msg_window_timeout_callback for channel = %p last acked seq num = %d, last received seq num =%d",
+                          (void*)ack_chan, ack_chan->ack_msg_seq_num, ack_chan->in_msg_seq_num ));
+    /* send ack message */
+    send_ack(ack_chan, ack_chan->channel_num, ACK_TIMEOUT, ack_chan->in_msg_seq_num);
+
+}
+
+
+
+/*** ACK QOS CLASS INSTANCES ***/
+
+static void channel_cons (orte_qos_ack_channel_t *ptr)
+{
+    int i;
+    OBJ_CONSTRUCT (&ptr->attributes, opal_list_t);
+    ptr->out_msg_seq_num = 0;
+    ptr->window_first_seq_num = 1;
+    ptr->in_msg_seq_num = 0;
+ ptr->ack_msg_seq_num = 0; + /* init seq num to room num array to -1 */ + for (i =0; i< QOS_ACK_MAX_OUTSTANDING_MSGS; i++) + ptr->seq_num_to_room_num[i] = -1; + OBJ_CONSTRUCT (&ptr->outstanding_msgs, opal_hotel_t); + ptr->state = orte_qos_ack_channel_state_inactive; +} +static void channel_des (orte_qos_ack_channel_t *ptr) +{ + // OPAL_LIST_DESTRUCT(&ptr->attributes); + //OBJ_DESTRUCT (&ptr->outstanding_msgs); +} +OBJ_CLASS_INSTANCE (orte_qos_ack_channel_t, + opal_list_item_t, + channel_cons, channel_des); diff --git a/orte/mca/qos/base/Makefile.am b/orte/mca/qos/base/Makefile.am new file mode 100644 index 00000000000..0f86a631ad6 --- /dev/null +++ b/orte/mca/qos/base/Makefile.am @@ -0,0 +1,18 @@ +# +# Copyright (c) 2014 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_ortedata_DATA += base/help-qos-base.txt + +headers += \ + base/base.h + +libmca_qos_la_SOURCES += \ + base/qos_base_frame.c \ + base/qos_base_select.c \ + base/qos_base_channel_handlers.c diff --git a/orte/mca/qos/base/base.h b/orte/mca/qos/base/base.h new file mode 100644 index 00000000000..d0918b5338f --- /dev/null +++ b/orte/mca/qos/base/base.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2014 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/**
+ * @file
+ *
+ * QoS Framework maintenance interface
+ *
+ * Declares the QoS framework object, its global state, the base channel
+ * type and the ORTE_QOS_SEND_COMPLETE helper shared by QoS components.
+ */
+
+#ifndef MCA_QOS_BASE_H
+#define MCA_QOS_BASE_H
+
+#include "orte_config.h"
+#include "orte/mca/qos/qos.h"
+#include "orte/mca/rml/base/base.h"
+#include "opal/class/opal_list.h"
+
+BEGIN_C_DECLS
+/*
+ * MCA Framework
+ */
+ORTE_DECLSPEC extern mca_base_framework_t orte_qos_base_framework;
+/* select a component */
+ORTE_DECLSPEC int orte_qos_base_select(void);
+
+/* a global struct containing framework-level values */
+typedef struct {
+    opal_list_t open_channels;
+    opal_pointer_array_t actives; /* components, indexed by QoS type in orte_qos_get_module() */
+#if OPAL_ENABLE_TIMING
+    bool timing;
+#endif
+} orte_qos_base_t;
+ORTE_DECLSPEC extern orte_qos_base_t orte_qos_base;
+
+#define ORTE_QOS_MAX_WINDOW_SIZE 1000
+
+typedef struct orte_qos_base_channel {
+    opal_list_item_t super;
+    uint32_t channel_num;
+    opal_list_t attributes;
+} orte_qos_base_channel_t;
+OBJ_CLASS_DECLARATION(orte_qos_base_channel_t);
+
+/* common implementations */
+ORTE_DECLSPEC void* orte_qos_get_module ( opal_list_t *qos_attributes);
+int orte_qos_base_pack_attributes (opal_buffer_t * buffer, opal_list_t * qos_attributes);
+/* Finish send m via its channel's QoS module if set, else via the RML; m is an orte_rml_send_t* expression; expansion already ends in ';' */
+#define ORTE_QOS_SEND_COMPLETE(m)                                       \
+    do {                                                                \
+        orte_qos_module_t *mod;                                         \
+        opal_output_verbose(5, orte_qos_base_framework.framework_output, \
+                            "%s-%s Send message complete at %s:%d",     \
+                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),         \
+                            ORTE_NAME_PRINT(&((m)->dst)),               \
+                            __FILE__, __LINE__);                        \
+        mod = (orte_qos_module_t*) (m)->channel->qos;                   \
+        if (NULL != mod)                                                \
+            mod->send_callback((m));                                    \
+        else                                                            \
+            ORTE_RML_SEND_COMPLETE((m));                                \
+    } while(0);
+
+END_C_DECLS
+
+#endif /* MCA_QOS_BASE_H */
diff --git a/orte/mca/qos/base/help-qos-base.txt b/orte/mca/qos/base/help-qos-base.txt
new file mode 100644
index 00000000000..cfa4b6cc2e2
--- /dev/null
+++ b/orte/mca/qos/base/help-qos-base.txt
@@ -0,0 +1,12 @@
+# -*- text -*-
+#
+# Copyright (c) 2014 Intel, Inc.
All rights reserved
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+#
+[no-qos-avail]
+No Qos protocols available.
diff --git a/orte/mca/qos/base/qos_base_channel_handlers.c b/orte/mca/qos/base/qos_base_channel_handlers.c
new file mode 100644
index 00000000000..1f8cdccbff3
--- /dev/null
+++ b/orte/mca/qos/base/qos_base_channel_handlers.c
@@ -0,0 +1,198 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/** @file:
+ * qos_base_channel_handlers.c - contains base functions handlers for open, send and close channel requests.
+ */
+
+/*
+ * includes
+ */
+#include "orte_config.h"
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
+#include "orte/constants.h"
+#include "orte/types.h"
+
+#include "opal/dss/dss.h"
+#include "opal/util/output.h"
+#include "opal/util/timings.h"
+#include "opal/class/opal_list.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/runtime/orte_globals.h"
+#include "orte/runtime/orte_wait.h"
+#include "orte/util/name_fns.h"
+
+#include "orte/mca/qos/qos.h"
+#include "orte/mca/qos/base/base.h"
+#include "orte/mca/rml/base/base.h"
+
+
+/**
+ * Pack the globally visible QoS attributes of a channel into a buffer.
+ *
+ * The buffer receives the number of packed attributes followed by the
+ * attributes themselves. Only ORTE_ATTR_GLOBAL attributes are packed, so
+ * the count is taken over those alone and always matches what follows.
+ *
+ * @param buffer          buffer to pack into
+ * @param qos_attributes  list of orte_attribute_t to pack from
+ * @return ORTE_SUCCESS, or the error code returned by opal_dss.pack
+ */
+int orte_qos_base_pack_attributes (opal_buffer_t * buffer,
+                                   opal_list_t * qos_attributes)
+{
+    int32_t num_attributes = 0;
+    int32_t rc = ORTE_SUCCESS;
+    orte_attribute_t *kv;
+
+    /* local attributes are never packed, so they must not be counted */
+    OPAL_LIST_FOREACH(kv, qos_attributes, orte_attribute_t) {
+        if (ORTE_ATTR_GLOBAL == kv->local) {
+            num_attributes++;
+        }
+    }
+    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                         "%s orte_qos_base_pack_attributes num_attributes = %d\n",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         num_attributes));
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, (void*)(&num_attributes), 1, ORTE_STD_CNTR))) {
+        ORTE_ERROR_LOG (rc);
+        return rc;
+    }
+    OPAL_LIST_FOREACH(kv, qos_attributes, orte_attribute_t) {
+        if (ORTE_ATTR_GLOBAL == kv->local) {
+            OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                                 "%s orte_qos_base_pack_attributes attribute key = %d value =%d\n",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 kv->key, kv->data.uint8));
+            if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, (void*)&kv, 1, ORTE_ATTRIBUTE))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+        }
+    }
+    return rc;
+}
+
+/**
+ * Look up the QoS module that serves the channel type requested in a list
+ * of attributes.
+ *
+ * @param qos_attributes  attribute list carrying ORTE_QOS_TYPE
+ * @return pointer to the module of the matching active component, or NULL
+ *         if the type is missing, out of range or has no active component
+ */
+void* orte_qos_get_module (opal_list_t *qos_attributes)
+{
+    /* the attribute is requested as OPAL_UINT8, so it is read into a uint8_t */
+    uint8_t type_val = 0;
+    uint8_t *type = &type_val;
+    mca_qos_base_component_t *qos_comp;
+
+    if(!orte_get_attribute( qos_attributes, ORTE_QOS_TYPE, (void**)&type, OPAL_UINT8))
+        return NULL;
+    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                         "%s orte_qos_get_module channel type = %d\n",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         type_val));
+    //check if type is valid
+    if (ORTE_QOS_MAX_COMPONENTS <= type_val)
+        return NULL;
+    // associate the qos module
+    qos_comp = (mca_qos_base_component_t *) opal_pointer_array_get_item(&orte_qos_base.actives, type_val);
+    if (NULL != qos_comp)
+    {
+        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                             "%s qos_base_get_module returning qos module %p type =%d",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             (void*)&qos_comp->mod, type_val));
+        return (void*)(&qos_comp->mod);
+    } else {
+        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                             "%s qos_base_get_module failed to get qos component of type =%d",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             type_val));
+    }
+    return NULL;
+}
+
+/*
+ * The functions below forward a channel request to the QoS module that is
+ * passed in as qos_mod. A NULL module is logged as ORTE_ERR_BAD_PARAM and
+ * reported through the error return of each function.
+ */
+
+void * orte_qos_create_channel (void *qos_mod, opal_list_t *qos_attributes, uint32_t channel_num) {
+    orte_qos_module_t *qos = (orte_qos_module_t *) (qos_mod);
+    if (NULL != qos)
+        return qos->create(qos_attributes, channel_num);
+    else
+        ORTE_ERROR_LOG (ORTE_ERR_BAD_PARAM);
+    return NULL;
+}
+
+int orte_qos_open_channel (void *qos_mod, void *qos_channel, opal_buffer_t * buffer) {
+    orte_qos_module_t *qos = (orte_qos_module_t *) (qos_mod);
+    if (NULL != qos)
+        return (qos->open (qos_channel, buffer));
+    else
+        ORTE_ERROR_LOG (ORTE_ERR_BAD_PARAM);
+    return ORTE_ERR_BAD_PARAM;
+}
+
+int orte_qos_close_channel (void *qos_mod, void *qos_channel) {
+    orte_qos_module_t *qos = (orte_qos_module_t *) (qos_mod);
+    if ((NULL != qos) && (NULL != qos_channel))
+        return (qos->close (qos_channel));
+    else
+        ORTE_ERROR_LOG (ORTE_ERR_BAD_PARAM);
+    return (ORTE_ERR_BAD_PARAM);
+}
+
+void orte_qos_init_recv_channel (void *qos_mod, void *qos_channel, opal_list_t * qos_attributes) {
+    orte_qos_module_t *qos = (orte_qos_module_t *) (qos_mod);
+    if (NULL != qos)
+        qos->init_recv (qos_channel, qos_attributes);
+    else
+        ORTE_ERROR_LOG (ORTE_ERR_BAD_PARAM);
+}
+
+int orte_qos_cmp_channel (void *qos_mod, void *qos_channel, opal_list_t * qos_attributes) {
+    orte_qos_module_t *qos = (orte_qos_module_t *) (qos_mod);
+    if (NULL != qos)
+        return (qos->cmp (qos_channel, qos_attributes));
+    ORTE_ERROR_LOG (ORTE_ERR_BAD_PARAM);
+    return -1;
+}
+
+int orte_qos_send_channel (void *qos_mod, void *qos_channel, orte_rml_send_t *msg) {
+    orte_qos_module_t *qos = (orte_qos_module_t *) (qos_mod);
+    if (NULL != qos)
+        return(qos->send (qos_channel, msg));
+    else
+        ORTE_ERROR_LOG (ORTE_ERR_BAD_PARAM);
+    return ORTE_ERROR;
+}
+
+int orte_qos_recv_channel (void *qos_mod, void *qos_channel, orte_rml_recv_t *msg) {
+    orte_qos_module_t *qos = (orte_qos_module_t *) (qos_mod);
+    if (NULL != qos)
+        return(qos->recv(qos_channel, msg));
+    else {
+        ORTE_ERROR_LOG (ORTE_ERR_BAD_PARAM);
+        return ORTE_ERROR;
+    }
+}
+
+
diff --git a/orte/mca/qos/base/qos_base_frame.c b/orte/mca/qos/base/qos_base_frame.c
new file mode 100644
index 00000000000..958f73c3fc4
--- /dev/null
+++ b/orte/mca/qos/base/qos_base_frame.c
@@ -0,0 +1,126 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+
+#include "opal/class/opal_bitmap.h"
+#include "opal/mca/mca.h"
+#include "opal/util/output.h"
+#include "opal/mca/base/base.h"
+
+#include "orte/mca/rml/base/base.h"
+#include "orte/mca/qos/base/base.h"
+#include "orte/mca/qos/qos.h"
+#if OPAL_ENABLE_FT_CR == 1
+#include "orte/mca/state/state.h"
+#endif
+
+/*
+ * The following file was created by configure. It contains extern
+ * statements and the definition of an array of pointers to each
+ * component's public mca_base_component_t struct.
+ */
+
+#include "orte/mca/qos/base/static-components.h"
+
+/*
+ * Global variables
+ */
+orte_qos_base_t orte_qos_base;
+OPAL_TIMING_DECLARE(tm_qos)
+
+/**
+ * Register the MCA parameters of the framework. When OPAL_ENABLE_TIMING is
+ * set this registers the boolean "timing" parameter of the qos base.
+ */
+static int orte_qos_base_register(mca_base_register_flag_t flags)
+{
+#if OPAL_ENABLE_TIMING
+    /* Detailed timing setup */
+    orte_qos_base.timing = false;
+    (void) mca_base_var_register ("orte", "qos", "base", "timing",
+                                  "Enable QOS timings",
+                                  MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                  OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
+                                  &orte_qos_base.timing);
+#endif
+    return ORTE_SUCCESS;
+}
+
+/**
+ * Close the framework: destruct the array of active components, report the
+ * timings and close all opened components.
+ */
+static int orte_qos_base_close(void)
+{
+
+
+    /* shutdown all active transports */
+    /*while (NULL != (cli = (mca_base_component_list_item_t *) opal_list_remove_first (&orte_qos_base.actives))) {
+        component = (mca_qos_base_component_t*)cli->cli_component;
+        if (NULL != component->shutdown) {
+            component->shutdown();
+        }
+        OBJ_RELEASE(cli);
+    }*/
+    // TO DO: the shutdown function of the active components is not called yet
+
+    /* destruct our internal lists */
+    OBJ_DESTRUCT(&orte_qos_base.actives);
+    OPAL_TIMING_EVENT((&tm_qos, "Finish"));
+    OPAL_TIMING_REPORT(orte_qos_base.timing, &tm_qos);
+
+    return mca_base_framework_components_close(&orte_qos_base_framework, NULL);
+}
+
+/**
+ * Function for finding and opening either all MCA components,
+ * or the one that was specifically requested via a MCA parameter.
+ */
+static int orte_qos_base_open(mca_base_open_flag_t flags)
+{
+    /* setup globals */
+    OBJ_CONSTRUCT(&orte_qos_base.actives, opal_pointer_array_t);
+    opal_pointer_array_init(&orte_qos_base.actives, ORTE_QOS_MAX_COMPONENTS, INT_MAX, 1);
+
+/*
+#if OPAL_ENABLE_FT_CR == 1
+
+    orte_state.add_job_state(ORTE_JOB_STATE_FT_CHECKPOINT, orte_qos_base_ft_event, ORTE_ERROR_PRI);
+    orte_state.add_job_state(ORTE_JOB_STATE_FT_CONTINUE, orte_qos_base_ft_event, ORTE_ERROR_PRI);
+    orte_state.add_job_state(ORTE_JOB_STATE_FT_RESTART, orte_qos_base_ft_event, ORTE_ERROR_PRI);
+#endif*/
+
+    OPAL_TIMING_INIT(&tm_qos);
+
+    /* Open up all available components */
+    return mca_base_framework_components_open(&orte_qos_base_framework, flags);
+}
+
+MCA_BASE_FRAMEWORK_DECLARE(orte, qos, "Messaging Quality of Service Subsystem",
+                           orte_qos_base_register, orte_qos_base_open, orte_qos_base_close,
+                           mca_qos_base_static_components, 0);
+
+/*** QOS CLASS INSTANCES ***/
+
+static void channel_cons (orte_qos_base_channel_t *ptr)
+{
+    OBJ_CONSTRUCT(&ptr->attributes, opal_list_t);
+}
+static void channel_des (orte_qos_base_channel_t *ptr)
+{
+    OPAL_LIST_DESTRUCT(&ptr->attributes);
+}
+OBJ_CLASS_INSTANCE (orte_qos_base_channel_t,
+                    opal_list_item_t,
+                    channel_cons, channel_des);
+
+
diff --git a/orte/mca/qos/base/qos_base_select.c b/orte/mca/qos/base/qos_base_select.c
new file mode 100644
index 00000000000..26fe71bfcac
--- /dev/null
+++ b/orte/mca/qos/base/qos_base_select.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2014 Intel, Inc. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+
+#include "opal/mca/mca.h"
+#include "opal/util/output.h"
+#include "opal/mca/base/base.h"
+
+#include "orte/util/show_help.h"
+
+#include "orte/runtime/orte_globals.h"
+#include "orte/mca/qos/qos.h"
+#include "orte/mca/qos/base/base.h"
+
+
+/**
+ * Function for selecting all runnable components from those that are
+ * available.
+ *
+ * Calls the start function of every available component that has one and
+ * stores each usable component in orte_qos_base.actives at the index of
+ * its QoS type.
+ *
+ * @return ORTE_SUCCESS if at least one component was stored,
+ *         ORTE_ERR_SILENT (after showing the "no-qos-avail" help topic)
+ *         otherwise
+ */
+int orte_qos_base_select(void)
+{
+    mca_base_component_list_item_t *cli;
+    mca_qos_base_component_t *component;
+    int count = 0;
+
+    /* Query all available components and ask if their transport is available */
+    OPAL_LIST_FOREACH(cli, &orte_qos_base_framework.framework_components, mca_base_component_list_item_t) {
+        component = (mca_qos_base_component_t *) cli->cli_component;
+
+        opal_output_verbose(5, orte_qos_base_framework.framework_output,
+                            "mca:qos:select: checking available component %s",
+                            component->qos_base.mca_component_name);
+        if (NULL == component->start )
+            opal_output_verbose(5, orte_qos_base_framework.framework_output,
+                                "mca:qos:select: component %s start function is null, type =%d",
+                                component->qos_base.mca_component_name, component->type);
+        else {
+            /* if it fails to startup, then skip it */
+            if (ORTE_SUCCESS != component->start()) {
+                opal_output_verbose(5, orte_qos_base_framework.framework_output,
+                                    "mca:qos:select: Skipping component [%s] - failed to initialize",
+                                    component->qos_base.mca_component_name );
+                continue;
+            }
+        }
+        count++;
+        /* store each qos component in the actives pointer array at the index of that component type */
+        opal_pointer_array_set_item(&orte_qos_base.actives,
+                                    component->type, component);
+    }
+
+    if (0 == count) {
+        /* no support available means we really cannot run */
+        opal_output_verbose(5, orte_qos_base_framework.framework_output,
+                            "mca:qos:select: Init failed to return any available QoS components");
+        orte_show_help("help-qos-base.txt", "no-qos-avail", true);
+        return ORTE_ERR_SILENT;
+    }
+    opal_output_verbose(5, orte_qos_base_framework.framework_output,
+                        "mca:qos:select: Found %d active QoS components",
+                        count);
+    return ORTE_SUCCESS;
+}
diff --git a/orte/mca/qos/noop/Makefile.am b/orte/mca/qos/noop/Makefile.am
new file mode 100644
index 00000000000..c4585de201c
--- /dev/null
+++ b/orte/mca/qos/noop/Makefile.am
@@ -0,0 +1,34 @@
+#
+# Copyright (c) 2014 Intel, Inc. All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+sources = \
+        qos_noop.h \
+        qos_noop_component.c
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+if MCA_BUILD_orte_qos_noop_DSO
+component_noinst =
+component_install = mca_qos_noop.la
+else
+component_noinst = libmca_qos_noop.la
+component_install =
+endif
+
+mcacomponentdir = $(ortelibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_qos_noop_la_SOURCES = $(sources)
+mca_qos_noop_la_LDFLAGS = -module -avoid-version
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_qos_noop_la_SOURCES = $(sources)
+libmca_qos_noop_la_LDFLAGS = -module -avoid-version
+
diff --git a/orte/mca/qos/noop/qos_noop.h b/orte/mca/qos/noop/qos_noop.h
new file mode 100644
index 00000000000..350d3110d7e
--- /dev/null
+++ b/orte/mca/qos/noop/qos_noop.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014 Intel, Inc. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/**
+ * @file
+ *
+ * QoS No-op Component interface
+ *
+ * Declares the noop component and module objects that are defined in
+ * qos_noop_component.c.
+ */
+
+#ifndef MCA_QOS_NOOP_H
+#define MCA_QOS_NOOP_H
+
+#include "orte_config.h"
+#include "orte/mca/qos/qos.h"
+#include "orte/mca/qos/base/base.h"
+
+BEGIN_C_DECLS
+
+
+ORTE_MODULE_DECLSPEC extern mca_qos_base_component_t mca_qos_noop_component;
+
+extern orte_qos_module_t orte_qos_noop_module;
+
+END_C_DECLS
+
+#endif /* MCA_QOS_NOOP_H */
diff --git a/orte/mca/qos/noop/qos_noop_channel_handlers.c b/orte/mca/qos/noop/qos_noop_channel_handlers.c
new file mode 100644
index 00000000000..5083ab48d15
--- /dev/null
+++ b/orte/mca/qos/noop/qos_noop_channel_handlers.c
@@ -0,0 +1,345 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2014 Intel, Inc. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/** @file:
+ * qos_noop_channel_handlers.c - contains base functions handlers for open, send and close channel requests.
+ *
+ * NOTE(review): this file is not listed in the sources of
+ * orte/mca/qos/noop/Makefile.am, so it does not appear to be compiled -
+ * confirm whether it is still needed.
+ */
+
+/*
+ * includes
+ */
+#include "orte_config.h"
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
+#include "orte/constants.h"
+#include "orte/types.h"
+
+#include "opal/dss/dss.h"
+#include "opal/util/output.h"
+#include "opal/util/timings.h"
+#include "opal/class/opal_list.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/runtime/orte_globals.h"
+#include "orte/runtime/orte_wait.h"
+#include "orte/util/name_fns.h"
+
+#include "orte/mca/qos/qos.h"
+#include "orte/mca/qos/base/base.h"
+
+
+static int orte_qos_base_pack_attributes (opal_buffer_t * buffer,
+                                          opal_list_t * qos_attributes)
+{
+    int32_t num_attributes;
+    int32_t rc= ORTE_SUCCESS;
+    orte_attribute_t *kv;
+    num_attributes = opal_list_get_size (qos_attributes);
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, (void*)(&num_attributes), 1, ORTE_STD_CNTR))) {
+        ORTE_ERROR_LOG (rc);
+        return rc;
+    }
+    OPAL_LIST_FOREACH(kv, qos_attributes, orte_attribute_t) {
+        if (ORTE_ATTR_GLOBAL == kv->local) {
+            if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, (void*)&kv, 1, ORTE_ATTRIBUTE))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+        }
+    }
+    return rc;
+}
+
+static int orte_qos_base_unpack_attributes (opal_buffer_t *buffer,
+                                            opal_list_t *qos_attributes)
+{
+    orte_attribute_t *kv;
+    int32_t count, n, k;
+    int32_t rc=ORTE_SUCCESS;
+    /* unpack the attributes */
+    n=1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &count,
+                                              &n, ORTE_STD_CNTR))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    for (k=0; k < count; k++) {
+        n=1;
+        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &kv,
+                                                  &n, ORTE_ATTRIBUTE))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+        opal_list_append(qos_attributes, &kv->super);
+    }
+    return rc;
+}
+
+void* orte_qos_base_create_channel ( orte_rml_channel_t *channel,
+                                     opal_list_t *qos_attributes)
+{
+    /* the attribute is requested as OPAL_UINT8 and read into the storage type points at */
+    uint8_t type_val = 0, *type = &type_val;
+    mca_qos_base_component_t *qos_comp;
+    if(!orte_get_attribute( qos_attributes, ORTE_QOS_TYPE, (void**)&type, OPAL_UINT8))
+        return NULL;
+    //check if type is valid
+    if (ORTE_QOS_MAX_COMPONENTS <= type_val)
+        return NULL;
+    // associate the qos module
+    qos_comp = (mca_qos_base_component_t *) opal_pointer_array_get_item(&orte_qos_base.actives, type_val);
+    if (NULL == qos_comp)
+        return NULL;
+    channel->qos = (void*) &qos_comp->mod;
+    // call create channel function of the module.
+    // NOTE(review): the create function type in qos.h also takes a channel number - confirm what to pass here
+    return (qos_comp->mod.create( qos_attributes));
+}
+
+void * orte_qos_base_create (opal_list_t *qos_attributes)
+{
+    orte_qos_base_channel_t * base_chan;
+    int32_t rc, *window, window_val;
+    orte_qos_type_t type_val = orte_qos_noop;
+    orte_qos_type_t *type = &type_val;
+    base_chan = OBJ_NEW (orte_qos_base_channel_t);
+    window = &window_val;
+    // TBD _ we ignore inapplicable attributes for now - need to return error?
+    // get attributes of interest to the base and store them locally.
+    if (ORTE_SUCCESS == (rc = orte_set_attribute( &base_chan->attributes, ORTE_QOS_TYPE, ORTE_ATTR_GLOBAL, (void*)type, OPAL_UINT8))) {
+        // window size??
+        if( orte_get_attribute (qos_attributes, ORTE_QOS_WINDOW_SIZE, (void**)&window, OPAL_UINT32)) {
+            if ( ORTE_QOS_MAX_WINDOW_SIZE < (*window)) {
+                ORTE_ERROR_LOG(OPAL_ERR_VALUE_OUT_OF_BOUNDS);
+                OBJ_RELEASE(base_chan);
+            }
+            else {
+                if (ORTE_SUCCESS != (rc = orte_set_attribute(&base_chan->attributes, ORTE_QOS_WINDOW_SIZE,
+                                                             ORTE_ATTR_GLOBAL, (void*)window, OPAL_UINT32))) {
+                    ORTE_ERROR_LOG(rc);
+                    OBJ_RELEASE(base_chan);
+                }
+            }
+        } else
+            OBJ_RELEASE(base_chan);
+    } else {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(base_chan);
+    }
+    return base_chan;
+}
+
+int orte_qos_base_open_channel ( void * qos_channel,
+                                 opal_buffer_t *buffer)
+{
+    int32_t rc = ORTE_SUCCESS;
+    orte_qos_base_channel_t *base_chan;
+    base_chan = (orte_qos_base_channel_t*) (qos_channel);
+    // the Qos module puts the non local attributes to be sent to the peer in a list at the time of create.
+    // pack those attributes into the buffer.
+    if (ORTE_SUCCESS != (rc = orte_qos_base_pack_attributes(buffer, &base_chan->attributes)))
+        ORTE_ERROR_LOG(rc);
+    return rc;
+}
+
+void orte_qos_base_chan_recv_init ( void * qos_channel,
+                                    opal_list_t *qos_attributes)
+{
+    // nothing to do for no op channel.
+}
+
+void orte_qos_base_close_channel ( void * qos_channel)
+{
+    qos_channel = (orte_qos_base_channel_t*) (qos_channel);
+    OBJ_RELEASE(qos_channel);
+}
+
+int orte_qos_base_comp_channel (void *qos_channel,
+                                opal_list_t *qos_attributes)
+{
+    int32_t chan_typea = 0, chan_typeb = 0, *ptr, window_sizea = 0, window_sizeb = 0;
+    orte_qos_base_channel_t *base_chan = (orte_qos_base_channel_t*) qos_channel;
+    ptr = &chan_typea;
+    if (!orte_get_attribute(&base_chan->attributes, ORTE_QOS_TYPE, (void**)&ptr, OPAL_UINT8))
+        return ORTE_ERROR;
+    ptr = &chan_typeb;
+    if (!orte_get_attribute(qos_attributes, ORTE_QOS_TYPE, (void**)&ptr, OPAL_UINT8))
+        return ORTE_ERROR;
+    if (chan_typea == chan_typeb) {
+        ptr = &window_sizea;
+        if (!orte_get_attribute(&base_chan->attributes, ORTE_QOS_WINDOW_SIZE, (void**)&ptr, OPAL_UINT32))
+            return ORTE_ERROR;
+        ptr = &window_sizeb;
+        if (!orte_get_attribute(qos_attributes, ORTE_QOS_WINDOW_SIZE, (void**)&ptr, OPAL_UINT32))
+            return ORTE_ERROR;
+        return (window_sizea != window_sizeb);
+    }
+    else
+        return ORTE_ERROR;
+}
+/*static void orte_qos_open_channel_reply_send_callback ( int status,
+                                                orte_process_name_t* sender,
+                                                opal_buffer_t* buffer,
+                                                orte_rml_tag_t tag,
+                                                void* cbdata)
+{
+    // this is the send call back for open channel reply
+    orte_qos_channel_t *channel = (orte_qos_channel_t*) cbdata;
+    // if the message was not sent we should retry or complete the request appropriately
+    if (status!= ORTE_SUCCESS)
+    {
+        //retry request.
+ } + // if success then release the buffer and do open channel request completion after receiving response from peer + OBJ_RELEASE(buffer); +} + +static void orte_qos_open_channel_send_callback ( int status, + orte_process_name_t* sender, + opal_buffer_t* buffer, + orte_rml_tag_t tag, + void* cbdata) +{ + // this is the send call back for open channel request + orte_qos_open_channel_t *req = (orte_qos_open_channel_t*) cbdata; + // if the message was not sent we should retry or complete the request appropriately + if (status!= ORTE_SUCCESS) + { + // retry if retriable failure. + // else call completion handler. + //remove channel from list + opal_list_remove_item(&orte_qos_base.open_channels, &req->channel->super); + OBJ_RELEASE(req->channel); + // update msg status and channel num so end point can have appropriate info + req->msg->status = status; + req->msg->channel_num = ORTE_QOS_INVALID_CHANNEL_NUM; + ORTE_RML_OPEN_CHANNEL_COMPLETE(req->msg); + OBJ_RELEASE(req); + } + // if success then release the buffer and do open channel request completion after receiving response from peer + OBJ_RELEASE(buffer); +} + +void orte_qos_base_open_channel(int sd, short args, void *cbdata) +{ + opal_buffer_t *buffer; int rc; + orte_qos_open_channel_t *open_channel; + orte_qos_open_channel_request_t *req = (orte_qos_open_channel_request_t*)cbdata; + // create channel on sender side by calling the respective qos module. 
+ req->post.channel = orte_qos_base_create_channel(req->post.msg->dst, req->post.msg->qos_attributes); + buffer = OBJ_NEW(opal_buffer_t); + //pack qos attributes list in buffer + if (ORTE_SUCCESS != orte_qos_base_pack_attributes(buffer, req->post.msg->qos_attributes)) { + //invalid attributes complete request with error + } + open_channel = OBJ_NEW(orte_qos_open_channel_t); + open_channel->msg = req->post.msg; + open_channel->channel = req->post.channel; + open_channel->msg->channel_num = open_channel->channel->channel_num; + OBJ_RELEASE(req); + // send request to peer to open channel + orte_rml.send_buffer_nb( &open_channel->msg->dst, buffer, ORTE_RML_TAG_OPEN_CHANNEL_REQ, + orte_qos_open_channel_send_callback, + open_channel); + // now post a recieve for open_channel_response tag + orte_rml.recv_buffer_nb(&open_channel->msg->dst, ORTE_RML_TAG_OPEN_CHANNEL_REPLY, + ORTE_RML_NON_PERSISTENT, orte_qos_open_channel_reply_callback, open_channel); + +} */ + + +/* +void orte_qos_open_channel_recv_callback (int status, + orte_process_name_t* peer, + struct opal_buffer_t* buffer, + orte_rml_tag_t tag, + void* cbdata) +{ + int32_t rc; + opal_list_t *qos_attributes = OBJ_NEW(opal_list_t); + orte_qos_channel_t *channel; + // un pack attributes first + if ( ORTE_SUCCESS == orte_qos_base_unpack_attributes( buffer, qos_attributes)) { + // create channel + if (NULL != (channel = orte_qos_base_create_channel ( *peer, qos_attributes)) ) { + buffer = OBJ_NEW (opal_buffer_t); + if (OPAL_SUCCESS != (rc = opal_dss.pack(buffer, &channel->channel_num , 1, OPAL_INT))) { + ORTE_ERROR_LOG(rc); + return; + } + // send channel accept to sender with local channel num + orte_rml.send_buffer_nb ( peer, buffer, ORTE_RML_TAG_OPEN_CHANNEL_REPLY, + orte_qos_open_channel_reply_send_callback, + channel); + } + else { + // reply with error message + } + } + else { + //reply with error message + } +} + +void orte_qos_open_channel_reply_callback (int status, + orte_process_name_t* peer, + struct 
opal_buffer_t* buffer,
+                                           orte_rml_tag_t tag,
+                                           void* cbdata)
+{
+    orte_qos_open_channel_t *req = (orte_qos_open_channel_t*) cbdata;
+    orte_qos_channel_t * channel = req->channel;
+    int32_t count = 1;
+    int32_t rc;
+    // process open_channel response from a peer for a open channel request
+    if (ORTE_SUCCESS == status) {
+        // unpack buffer and get peer channel number.
+
+        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &channel->peer_channel_num, &count, OPAL_INT))) {
+            ORTE_ERROR_LOG(rc);
+            // do error completion
+            channel->state = orte_qos_channel_closed;
+            //remove channel from list
+            opal_list_remove_item(&orte_qos_base.open_channels, &channel->super);
+            OBJ_RELEASE(channel);
+            // update msg status and channel num so end point can have appropriate info
+            req->msg->status = ORTE_ERR_OPEN_CHANNEL_PEER_RESPONSE_INV;
+            req->msg->channel_num = ORTE_QOS_INVALID_CHANNEL_NUM;
+        }
+        else {
+            channel->state = orte_qos_channel_open;
+            req->msg->status = ORTE_SUCCESS;
+            req->msg->channel_num = channel->channel_num;
+        }
+    }
+    else {
+        channel->state = orte_qos_channel_closed;
+        //remove channel from list
+        opal_list_remove_item(&orte_qos_base.open_channels, &channel->super);
+        OBJ_RELEASE(channel);
+        // update msg status and channel num so end point can have appropriate info
+        req->msg->status = ORTE_ERR_OPEN_CHANNEL_PEER_FAIL;
+        req->msg->channel_num = ORTE_QOS_INVALID_CHANNEL_NUM;
+    }
+    ORTE_RML_OPEN_CHANNEL_COMPLETE(req->msg);
+    OBJ_RELEASE(req);
+    OBJ_RELEASE(buffer);
+    // 1: If success record peer channel number, update channel state.
+    //2: If not destroy channel.
+    //3: complete openchannel request.
+} */
+
+
diff --git a/orte/mca/qos/noop/qos_noop_component.c b/orte/mca/qos/noop/qos_noop_component.c
new file mode 100644
index 00000000000..638411b4729
--- /dev/null
+++ b/orte/mca/qos/noop/qos_noop_component.c
@@ -0,0 +1,226 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+
+
+#include "opal/mca/mca.h"
+#include "opal/util/output.h"
+#include "opal/mca/base/base.h"
+
+
+#include "orte/mca/qos/base/base.h"
+#include "orte/mca/qos/qos.h"
+
+static int qos_noop_start (void);
+static void qos_noop_shutdown (void);
+static void* noop_create (opal_list_t *qos_attributes, uint32_t channel_num);
+static int noop_open (void *qos_channel,
+                      opal_buffer_t * buf);
+static int noop_send ( void *qos_channel, orte_rml_send_t *msg);
+static int noop_recv (void *channel, orte_rml_recv_t *msg);
+static int noop_close (void * channel);
+static int noop_init_recv (void *channel, opal_list_t *attributes);
+static int noop_cmp (void *channel, opal_list_t *attributes);
+static void noop_send_callback (orte_rml_send_t *msg);
+
+/**
+ * noop module definition
+ */
+orte_qos_module_t orte_qos_noop_module = {
+    noop_create,
+    noop_open,
+    noop_send,
+    noop_recv,
+    noop_close,
+    noop_init_recv,
+    noop_cmp,
+    noop_send_callback
+};
+
+/**
+ * component definition
+ */
+mca_qos_base_component_t mca_qos_noop_component = {
+    /* First, the mca_base_component_t struct containing meta
+       information about the component itself */
+
+    {
+        MCA_QOS_BASE_VERSION_2_0_0,
+
+        "noop", /* MCA component name */
+        ORTE_MAJOR_VERSION, /* MCA component major version */
+        ORTE_MINOR_VERSION, /* MCA component minor version */
+        ORTE_RELEASE_VERSION, /* MCA component release version */
+        NULL,
+        NULL,
+    },
+    qos_noop_start,
+    qos_noop_shutdown,
+    orte_qos_noop,
+    {
+        noop_create,
+        noop_open,
+        noop_send,
+        noop_recv,
+        noop_close,
+        noop_init_recv,
+        noop_cmp,
+        noop_send_callback
+    }
+};
+
+/* component start function: always reports success */
+static int qos_noop_start(void) {
+    return ORTE_SUCCESS;
+}
+
+/* component shutdown function: does nothing */
+static void qos_noop_shutdown (void) {
+}
+
+/**
+ * Create a noop channel.
+ *
+ * The channel records ORTE_QOS_TYPE (noop) and the ORTE_QOS_WINDOW_SIZE found
+ * in qos_attributes. The channel is released when the window size is missing
+ * or larger than ORTE_QOS_MAX_WINDOW_SIZE, or when an attribute cannot be set.
+ *
+ * NOTE(review): the failure paths return noop_chan after OBJ_RELEASE; this
+ * presumably relies on OBJ_RELEASE resetting the pointer to NULL - confirm.
+ */
+static void* noop_create (opal_list_t *qos_attributes, uint32_t channel_num) {
+    orte_qos_base_channel_t * noop_chan;
+    int32_t rc, *window, window_val;
+    orte_qos_type_t type_val = orte_qos_noop;
+    orte_qos_type_t *type;
+
+    noop_chan = OBJ_NEW (orte_qos_base_channel_t);
+    noop_chan->channel_num = channel_num;
+    type = &type_val;
+    window = &window_val;
+    // TBD _ we ignore inapplicable attributes for now - need to return error?
+    // get attributes of interest to the base and store them locally.
+    if (ORTE_SUCCESS == (rc = orte_set_attribute( &noop_chan->attributes, ORTE_QOS_TYPE, ORTE_ATTR_GLOBAL, (void*)type, OPAL_UINT8))) {
+        // window size??
+        if( orte_get_attribute (qos_attributes, ORTE_QOS_WINDOW_SIZE, (void**)&window, OPAL_UINT32)) {
+            if ( ORTE_QOS_MAX_WINDOW_SIZE < (*window)) {
+                ORTE_ERROR_LOG(OPAL_ERR_VALUE_OUT_OF_BOUNDS);
+                OBJ_RELEASE(noop_chan);
+            }
+            else {
+                if (ORTE_SUCCESS != (rc = orte_set_attribute(&noop_chan->attributes, ORTE_QOS_WINDOW_SIZE,
+                                                             ORTE_ATTR_GLOBAL, (void*)window, OPAL_UINT32))) {
+                    ORTE_ERROR_LOG(rc);
+                    OBJ_RELEASE(noop_chan);
+                }
+            }
+        }else
+            OBJ_RELEASE(noop_chan);
+    } else {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(noop_chan);
+    }
+    return noop_chan;
+}
+
+/**
+ * Pack the attributes stored in the channel at create time into buf.
+ */
+static int noop_open (void *qos_channel, opal_buffer_t * buf)
+{
+    int32_t rc = ORTE_SUCCESS;
+    orte_qos_base_channel_t *noop_chan;
+    noop_chan = (orte_qos_base_channel_t*) (qos_channel);
+    // the Qos module puts the non local attributes to be sent to the peer in a list at the time of create.
+    // pack those attributes into the buffer.
+    if (ORTE_SUCCESS != (rc = orte_qos_base_pack_attributes(buf, &noop_chan->attributes)))
+        ORTE_ERROR_LOG(rc);
+    return rc;
+}
+
+/* send hook: only emits a verbose trace of the message */
+static int noop_send ( void *qos_channel, orte_rml_send_t *msg)
+{
+    //nothing to do
+    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                         "%s noop_send msg = %p to peer = %s\n",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         (void*)msg, ORTE_NAME_PRINT(&msg->dst)));
+    return ORTE_SUCCESS;
+}
+
+/* receive hook: only emits a verbose trace of the message */
+static int noop_recv (void *qos_channel, orte_rml_recv_t *msg)
+{
+    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
+                         "%s noop_recv msg = %p from peer = %s\n",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         (void*)msg, ORTE_NAME_PRINT(&msg->sender)));
+    return ORTE_SUCCESS;
+}
+
+/* release the channel object; a NULL channel yields ORTE_ERR_BAD_PARAM */
+static int noop_close (void * channel)
+{
+    orte_qos_base_channel_t *noop_chan;
+    if(NULL != channel) {
+        noop_chan = (orte_qos_base_channel_t*) channel;
+        OBJ_RELEASE (noop_chan);
+        return ORTE_SUCCESS;
+    } else
+        return ORTE_ERR_BAD_PARAM;
+
+}
+
+/* receive-side initialization hook: returns success without doing anything */
+static int noop_init_recv (void *channel, opal_list_t *attributes)
+{
+    return ORTE_SUCCESS;
+}
+
+/**
+ * Compare a channel against a list of requested attributes.
+ *
+ * @return 0 if QoS type and window size both match, 1 if only the window
+ *         size differs, ORTE_ERROR if the types differ or an attribute is
+ *         missing on either side
+ */
+static int noop_cmp (void *channel, opal_list_t *attributes)
+{
+    /* the types are requested as OPAL_UINT8, i.e. narrower than the int32_t
+       they are read into: start from zero so the comparison is well defined */
+    int32_t chan_typea = 0, chan_typeb = 0, *ptr, window_sizea = 0, window_sizeb = 0;
+    orte_qos_base_channel_t *noop_chan = (orte_qos_base_channel_t*) channel;
+    ptr = &chan_typea;
+    if (!orte_get_attribute(&noop_chan->attributes, ORTE_QOS_TYPE, (void**)&ptr, OPAL_UINT8))
+        return ORTE_ERROR;
+    ptr = &chan_typeb;
+    if (!orte_get_attribute(attributes, ORTE_QOS_TYPE, (void**)&ptr, OPAL_UINT8))
+        return ORTE_ERROR;
+    if (chan_typea == chan_typeb) {
+        ptr = &window_sizea;
+        if (!orte_get_attribute(&noop_chan->attributes, ORTE_QOS_WINDOW_SIZE, (void**)&ptr, OPAL_UINT32))
+            return ORTE_ERROR;
+        ptr = &window_sizeb;
+        if (!orte_get_attribute(attributes, ORTE_QOS_WINDOW_SIZE, (void**)&ptr, OPAL_UINT32))
+            return ORTE_ERROR;
+        return (window_sizea != window_sizeb);
+    }
+    else
+        return ORTE_ERROR;
+}
+
+static void noop_send_callback (orte_rml_send_t *msg)
+{
+    // nothing to do for noop
+    ORTE_RML_SEND_COMPLETE(msg);
+}
diff --git a/orte/mca/qos/qos.h b/orte/mca/qos/qos.h
new file mode 100644
index 00000000000..378a8b9d93c
--- /dev/null
+++ b/orte/mca/qos/qos.h
@@ -0,0 +1,159 @@
+/**
+ * Copyright (c) 2014 Intel, Inc. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ * This header defines Quality of Service Interface for Runtime messaging
+ */
+
+/**
+ * @file
+ *
+ * Quality of Service (QoS) Communication Interface
+ *
+ * The QoS layer is responsible for providing quality of service for
+ * messages exchanged between two ORTE processes through the use of
+ * channels.
+ */
+#ifndef MCA_QOS_H_
+#define MCA_QOS_H_
+
+#include "orte_config.h"
+#include "orte/types.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#include "opal/class/opal_list.h"
+#include "opal/mca/mca.h"
+#include "orte/mca/rml/base/base.h"
+#include "orte/mca/qos/base/base.h"
+#include "orte/mca/errmgr/errmgr.h"
+
+BEGIN_C_DECLS
+/* ******************************************************************** */
+struct opal_buffer_t;
+struct orte_process_name_t;
+
+
+/* ******************************************************************** */
+#define ORTE_QOS_INVALID_CHANNEL_NUM 0xFFFF
+#define ORTE_QOS_MAX_COMPONENTS 5
+typedef void (*orte_qos_callback_fn_t)(int status,
+                                       int channel_num,
+                                       struct orte_process_name_t* peer,
+                                       void* cbdata);
+
+typedef int (*mca_qos_base_component_start_fn_t)(void);
+typedef void (*mca_qos_base_component_shutdown_fn_t)(void);
+
+#if OPAL_ENABLE_FT_CR == 1
+typedef int (*mca_qos_base_component_ft_event_fn_t)(int state);
+#endif
+ORTE_DECLSPEC void * orte_qos_create_channel (void *qos_mod, opal_list_t *qos_attributes, uint32_t channel_num);
+ORTE_DECLSPEC int orte_qos_open_channel (void *qos_mod, void *qos_channel, opal_buffer_t * buffer);
+ORTE_DECLSPEC int orte_qos_close_channel (void *qos_mod, void *qos_channel);
+ORTE_DECLSPEC void orte_qos_init_recv_channel (void *qos_mod, void *qos_channel, opal_list_t *qos_attributes);
+ORTE_DECLSPEC int orte_qos_cmp_channel (void *qos_mod, void *qos_channel, opal_list_t *qos_attributes);
+ORTE_DECLSPEC int orte_qos_send_channel (void *qos_mod, void *qos_channel, orte_rml_send_t *msg);
+ORTE_DECLSPEC int orte_qos_recv_channel (void *qos_mod, void *qos_channel, orte_rml_recv_t *msg);
+/**
+ * qos module (channel) create function
+ * initialize type specific attributes of the channel.
+ */
+typedef void* (*orte_qos_base_module_create_fn_t) (opal_list_t *qos_attributes, uint32_t channel_num);
+
+/**
+ * qos module (channel) open function
+ * this function is called when rml_open_channel is requested
+ */
+typedef int (*orte_qos_base_module_open_fn_t) (void *qos_channel,
+                                               opal_buffer_t * buf);
+
+/**
+ * qos module (channel) send function
+ * this function is called when rml_send_channel is requested
+ */
+typedef int (*orte_qos_base_module_send_fn_t) ( void * qos_channel,
+                                                orte_rml_send_t *send);
+
+/**
+ * qos module (channel) recv function
+ * this function is called when a message is received on a channel
+ */
+typedef int (*orte_qos_base_module_recv_fn_t) ( void * channel,
+                                                orte_rml_recv_t *msg);
+/**
+ * qos module (channel) close function
+ * this function is called when a channel is being closed
+ */
+
+typedef int (*orte_qos_base_module_close_fn_t) ( void * channel);
+/**
+ * qos module (channel) init recv
+ * this function is used to initialize a channel for receiving msgs (called in response to open_channel req from peer)
+ */
+typedef int (*orte_qos_base_module_init_recv_fn_t) (void * channel, opal_list_t * attributes);
+
+/**
+ * qos module (channel) compare functions
+ * compares attributes of existing channel with the requested list of attributes
+ */
+typedef int (*orte_qos_base_module_cmp_fn_t) (void * channel, opal_list_t * attributes);
+
+/**
+ * qos module (channel) send callback function
+ * takes the send request; the noop module uses it to complete the request
+ */
+typedef void (*orte_qos_base_module_send_callback_fn_t) (orte_rml_send_t *msg);
+
+/**
+ *
+ * the qos module data structure: the channel functions a qos component provides
+ */
+typedef struct {
+    orte_qos_base_module_create_fn_t create;
+    orte_qos_base_module_open_fn_t open;
+    orte_qos_base_module_send_fn_t send;
+    orte_qos_base_module_recv_fn_t recv;
+    orte_qos_base_module_close_fn_t close;
+    orte_qos_base_module_init_recv_fn_t init_recv;
+    orte_qos_base_module_cmp_fn_t cmp;
+    orte_qos_base_module_send_callback_fn_t send_callback;
+} orte_qos_module_t;
+
+typedef enum {
+    orte_qos_noop = 0,
+    orte_qos_ack = 1,
+    orte_qos_nack = 2,
+    orte_qos_ack_nack_hybrid = 3,
+    orte_qos_multipath = 4,
+}orte_qos_type_t ;
+
+typedef struct {
+    mca_base_component_t qos_base;
+    mca_qos_base_component_start_fn_t start;
+    mca_qos_base_component_shutdown_fn_t shutdown;
+    orte_qos_type_t type;
+    orte_qos_module_t mod;
+/*  mca_qos_base_componenet_open_channel_fn_t open_channel;
+    mca_qos_base_component_send_channel_nb_fn_t send_channel;
+    mca_qos_base_component_recv_channel_nb_fn_t recv_channel;
+    mca_qos_base_component_close_channel_fn_t close_channel;*/
+#if OPAL_ENABLE_FT_CR == 1
+    mca_qos_base_component_ft_event_fn_t ft_event;
+#endif
+} mca_qos_base_component_t;
+
+/**
+ * Macro for use in components that are of type qos
+ */
+#define MCA_QOS_BASE_VERSION_2_0_0 \
+ORTE_MCA_BASE_VERSION_2_1_0 ("qos", 2, 0, 0)
+
+END_C_DECLS
+
+#endif
diff --git a/orte/mca/rml/base/Makefile.am b/orte/mca/rml/base/Makefile.am index dfd8df0a504..1461032070a 100644 --- a/orte/mca/rml/base/Makefile.am +++ b/orte/mca/rml/base/Makefile.am @@ -5,16 +5,16 @@ # Copyright (c) 2004-2005 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights -# reserved. +# reserved. # $COPYRIGHT$ -# +# # Additional copyrights may follow -# +# # $HEADER$ # @@ -26,4 +26,5 @@ libmca_rml_la_SOURCES += \ base/rml_base_frame.c \ base/rml_base_receive.c \ base/rml_base_contact.c \ - base/rml_base_msg_handlers.c + base/rml_base_msg_handlers.c \ + base/rml_base_channel_handlers.c diff --git a/orte/mca/rml/base/base.h b/orte/mca/rml/base/base.h index 4eca6e45e18..e4d9119ede8 100644 --- a/orte/mca/rml/base/base.h +++ b/orte/mca/rml/base/base.h @@ -6,17 +6,17 @@ * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014 -2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -26,7 +26,7 @@ * RML Framework maintenence interface * * Interface for starting / stopping / controlling the RML framework, - * as well as support for modifying RML datatypes. + * as well as support for modifying RML datatypes. * * @note The only RML datatype exposed to the user is the RML tag. 
* This will always be an integral value, so the only datatype support @@ -43,11 +43,13 @@ #include "opal/dss/dss_types.h" #include "orte/mca/mca.h" #include "opal/util/timings.h" +#include "opal/class/opal_pointer_array.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/rml/rml.h" + BEGIN_C_DECLS OPAL_TIMING_DECLARE_EXT(ORTE_DECLSPEC, tm_rml) @@ -84,6 +86,7 @@ ORTE_DECLSPEC void orte_rml_base_comm_stop(void); typedef struct { opal_list_t posted_recvs; opal_list_t unmatched_msgs; + opal_pointer_array_t open_channels; #if OPAL_ENABLE_TIMING bool timing; #endif @@ -109,12 +112,37 @@ ORTE_DECLSPEC extern opal_list_t orte_rml_base_components; * Component structure pointer for the currently selected RML * component. Useable between calls to orte_rml_base_select() and * orte_rml_base_close(). - * * @note This pointer should not be used outside the RML base. It is * available outside the RML base only for the F/T component. */ ORTE_DECLSPEC extern orte_rml_component_t *orte_rml_component; +typedef enum { + orte_rml_channel_opening = 0, + orte_rml_channel_open = 1, + orte_rml_channel_closing = 2, + orte_rml_channel_closed = 3, +}orte_rml_channel_state_t; + +/** + * RML channel structure. + * The RML only needs basic channel information as the rest of the book keeping information + * is stored in the QoS module specific channel object. + * It contains a pointer to the QoS module that handles requests on the channel. + * It contains a pointer to a struct that contains the QoS specific channel data. + */ +typedef struct { + opal_list_item_t super; + orte_rml_channel_num_t channel_num; // the channel number reference (exposed to the user). 
+ orte_process_name_t peer; // the other end point (peer) of the channel + orte_rml_channel_num_t peer_channel; // peer channel number + void * qos; // pointer to QoS component specific module + void * qos_channel_ptr; // pointer to QoS component specific channel struct + orte_rml_channel_state_t state; // channel state + bool recv; // set to true if this is a receive (peer opened) channel. (Default is send channel) +} orte_rml_channel_t; +OBJ_CLASS_DECLARATION(orte_rml_channel_t); + /* structure to send RML messages - used internally */ typedef struct { @@ -128,6 +156,8 @@ typedef struct { union { orte_rml_callback_fn_t iov; orte_rml_buffer_callback_fn_t buffer; + orte_rml_send_channel_callback_fn_t iov_chan; + orte_rml_send_buffer_channel_callback_fn_t buf_chan; } cbfunc; void *cbdata; @@ -136,6 +166,13 @@ typedef struct { int count; /* pointer to the user's buffer */ opal_buffer_t *buffer; + /*** TODO : need to move channel specific data to a channel struct */ + /* pointer to the channel object */ + orte_rml_channel_t *channel; + /* destination channel number */ + orte_rml_channel_num_t dst_channel; + /* msg seq number */ + uint32_t seq_num; /* pointer to raw data for cross-transport * transfers */ @@ -143,11 +180,47 @@ typedef struct { } orte_rml_send_t; OBJ_CLASS_DECLARATION(orte_rml_send_t); +/* structure to send RML channel open messages - used internally */ +typedef struct { + opal_list_item_t super; + /* peer process */ + orte_process_name_t dst; + /* msg send status */ + int status; + /* channel object */ + orte_rml_channel_t *channel; + /* attributes of the channel */ + opal_list_t *qos_attributes; + /* user's callback function */ + orte_rml_channel_callback_fn_t cbfunc; + /* user's cbdata */ + void *cbdata; +} orte_rml_open_channel_t; +OBJ_CLASS_DECLARATION(orte_rml_open_channel_t); + +/* structure to send RML channel close messages - used internally */ +typedef struct { + opal_list_item_t super; + /* msg send status */ + int status; + /* channel object 
*/ + orte_rml_channel_t *channel; + /* user's callback function */ + orte_rml_channel_callback_fn_t cbfunc; + /* user's cbdata */ + void *cbdata; +} orte_rml_close_channel_t; +OBJ_CLASS_DECLARATION(orte_rml_close_channel_t); + /* define an object for transferring send requests to the event lib */ typedef struct { opal_object_t super; opal_event_t ev; - orte_rml_send_t post; + union { + orte_rml_send_t send; + orte_rml_open_channel_t open_channel; + orte_rml_close_channel_t close_channel; + }post; } orte_rml_send_request_t; OBJ_CLASS_DECLARATION(orte_rml_send_request_t); @@ -157,6 +230,8 @@ typedef struct { opal_event_t ev; orte_process_name_t sender; // sender orte_rml_tag_t tag; // targeted tag + orte_rml_channel_num_t channel_num; // channel number + uint32_t seq_num; //sequence number struct iovec iov; // the recvd data } orte_rml_recv_t; OBJ_CLASS_DECLARATION(orte_rml_recv_t); @@ -184,7 +259,7 @@ typedef struct { } orte_rml_recv_request_t; OBJ_CLASS_DECLARATION(orte_rml_recv_request_t); -#define ORTE_RML_POST_MESSAGE(p, t, b, l) \ +#define ORTE_RML_POST_MESSAGE(p, t, c, s, b, l) \ do { \ orte_rml_recv_t *msg; \ opal_output_verbose(5, orte_rml_base_framework.framework_output, \ @@ -195,6 +270,8 @@ OBJ_CLASS_DECLARATION(orte_rml_recv_request_t); msg->sender.jobid = (p)->jobid; \ msg->sender.vpid = (p)->vpid; \ msg->tag = (t); \ + msg->channel_num = (c); \ + msg->seq_num = (s); \ msg->iov.iov_base = (IOVBASE_TYPE*)(b); \ msg->iov.iov_len = (l); \ /* setup the event */ \ @@ -215,6 +292,21 @@ OBJ_CLASS_DECLARATION(orte_rml_recv_request_t); opal_event_active(&(m)->ev, OPAL_EV_WRITE, 1); \ } while(0); +/* + reactivates rcv msg on the unposted rcvd list when a match occurs + need a different path as the QoS recv processing was already done + for this process +*/ +#define ORTE_RML_REACTIVATE_MESSAGE(m) \ + do { \ + /* setup the event */ \ + opal_event_set(orte_event_base, &(m)->ev, -1, \ + OPAL_EV_WRITE, \ + orte_rml_base_reprocess_msg, (m)); \ + 
opal_event_set_priority(&(m)->ev, ORTE_MSG_PRI); \ + opal_event_active(&(m)->ev, OPAL_EV_WRITE, 1); \ +} while(0); + #define ORTE_RML_SEND_COMPLETE(m) \ do { \ opal_output_verbose(5, orte_rml_base_framework.framework_output, \ @@ -222,34 +314,99 @@ OBJ_CLASS_DECLARATION(orte_rml_recv_request_t); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ ORTE_NAME_PRINT(&((m)->dst)), \ __FILE__, __LINE__); \ - if (NULL != (m)->iov) { \ - if (NULL != (m)->cbfunc.iov) { \ - (m)->cbfunc.iov((m)->status, \ + if( NULL == (m)->channel) { \ + if (NULL != (m)->iov) { \ + if (NULL != (m)->cbfunc.iov) { \ + (m)->cbfunc.iov((m)->status, \ &((m)->dst), \ (m)->iov, (m)->count, \ (m)->tag, (m)->cbdata); \ - } \ - } else { \ - /* non-blocking buffer send */ \ - (m)->cbfunc.buffer((m)->status, &((m)->origin), \ + } \ + } else { \ + /* non-blocking buffer send */ \ + (m)->cbfunc.buffer((m)->status, &((m)->origin), \ (m)->buffer, \ (m)->tag, (m)->cbdata); \ + } \ + } else { \ + if (NULL != (m)->iov) { \ + if (NULL != (m)->cbfunc.iov_chan) { \ + (m)->cbfunc.iov_chan((m)->status, \ + (m)->channel->channel_num, \ + (m)->iov, (m)->count, \ + (m)->tag, (m)->cbdata); \ + } \ + } else { \ + /* non-blocking buffer send */ \ + (m)->cbfunc.buf_chan((m)->status, \ + (m)->channel->channel_num, \ + (m)->buffer, \ + (m)->tag, (m)->cbdata); \ + } \ } \ OBJ_RELEASE(m); \ }while(0); + +#define ORTE_RML_OPEN_CHANNEL_COMPLETE(m) \ + do { \ + opal_output_verbose(5, orte_rml_base_framework.framework_output, \ + "%s-%s open channel message complete at %s:%d", \ + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ + ORTE_NAME_PRINT(&((m)->dst)), \ + __FILE__, __LINE__); \ + /* call the callback function */ \ + (m)->cbfunc((m)->status, (m)->channel->channel_num, \ + &((m)->dst), \ + NULL, (m)->cbdata) ; \ + }while(0); + +#define ORTE_RML_CLOSE_CHANNEL_COMPLETE(m) \ + do { \ + opal_output_verbose(5, orte_rml_base_framework.framework_output, \ + "%s-%d close channel message complete at %s:%d", \ + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \ + 
(m)->channel->channel_num, \ + __FILE__, __LINE__); \ + /* call the callback function */ \ + (m)->cbfunc((m)->status, (m)->channel->channel_num, \ + NULL, NULL, (m)->cbdata) ; \ +}while(0); /* * This is the base priority for a RML wrapper component - * If there exists more than one wrapper, then the one with + * If there exists more than one wrapper, then the one with * the lowest priority wins. */ #define RML_SELECT_WRAPPER_PRIORITY -128 +#define ORTE_RML_INVALID_CHANNEL_NUM 1599 +ORTE_DECLSPEC orte_rml_channel_t * orte_rml_base_get_channel (orte_rml_channel_num_t chan_num); + + /* common implementations */ ORTE_DECLSPEC void orte_rml_base_post_recv(int sd, short args, void *cbdata); ORTE_DECLSPEC void orte_rml_base_process_msg(int fd, short flags, void *cbdata); ORTE_DECLSPEC void orte_rml_base_process_error(int fd, short flags, void *cbdata); - +ORTE_DECLSPEC void orte_rml_base_open_channel(int fd, short flags, void *cbdata); +ORTE_DECLSPEC void orte_rml_base_close_channel(int fd, short flags, void *cbdata); +ORTE_DECLSPEC void orte_rml_base_open_channel_send_callback ( int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +ORTE_DECLSPEC void orte_rml_base_open_channel_resp_callback (int status, orte_process_name_t* peer, + struct opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +ORTE_DECLSPEC void orte_rml_base_open_channel_reply_send_callback ( int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); +ORTE_DECLSPEC void orte_rml_base_prep_send_channel (orte_rml_channel_t *channel, + orte_rml_send_t *send); +ORTE_DECLSPEC int orte_rml_base_process_recv_channel (orte_rml_channel_t *channel, + orte_rml_recv_t *recv); +ORTE_DECLSPEC void orte_rml_base_close_channel_send_callback ( int status, orte_process_name_t* sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata); +ORTE_DECLSPEC void orte_rml_base_send_close_channel ( orte_rml_close_channel_t 
*close_chan); +ORTE_DECLSPEC void orte_rml_base_reprocess_msg(int fd, short flags, void *cbdata); +ORTE_DECLSPEC void orte_rml_base_complete_recv_msg (orte_rml_recv_t **recv_msg); END_C_DECLS #endif /* MCA_RML_BASE_H */ diff --git a/orte/mca/rml/base/rml_base_channel_handlers.c b/orte/mca/rml/base/rml_base_channel_handlers.c new file mode 100644 index 00000000000..cd5112106d6 --- /dev/null +++ b/orte/mca/rml/base/rml_base_channel_handlers.c @@ -0,0 +1,540 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * + * Copyright (c) 2015 Intel, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + */ + +/* + * includes + */ +#include "orte_config.h" + +#ifdef HAVE_STRING_H +#include +#endif + +#include "orte/constants.h" +#include "orte/types.h" + +#include "opal/dss/dss.h" +#include "opal/util/output.h" +#include "opal/util/timings.h" +#include "opal/class/opal_list.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_wait.h" +#include "orte/util/name_fns.h" + +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/base/base.h" +#include "orte/mca/rml/base/rml_contact.h" +#include "orte/mca/qos/base/base.h" + + +static int unpack_channel_attributes (opal_buffer_t *buffer, opal_list_t *qos_attributes); +static orte_rml_channel_t * get_channel ( orte_process_name_t * peer, + opal_list_t *qos_attributes, + bool recv); +static int send_open_channel_reply (orte_process_name_t *peer, + orte_rml_channel_t *channel, + bool accept); +void orte_rml_base_close_channel(int fd, short flags, void *cbdata) +{ + orte_rml_send_request_t *req = (orte_rml_send_request_t*)cbdata; + orte_rml_close_channel_t *close_chan; + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_close_channel to peer %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&req->post.close_channel.channel->peer))); + 
OPAL_TIMING_EVENT((&tm_rml, "to %s", ORTE_NAME_PRINT(&req->post.close_channel.channel->peer))); + close_chan = OBJ_NEW(orte_rml_close_channel_t); + close_chan->channel = req->post.close_channel.channel; + close_chan->cbfunc = req->post.close_channel.cbfunc; + close_chan->cbdata = req->post.close_channel.cbdata; + OBJ_RELEASE(req); + /* check with qos if the channel ready to be closed */ + if (ORTE_SUCCESS == orte_qos_close_channel (close_chan->channel->qos, + close_chan->channel->qos_channel_ptr)) { + orte_rml_base_send_close_channel( close_chan); + } + /* complete close request with error channel busy */ + else { + close_chan->status = ORTE_ERR_CHANNEL_BUSY; + ORTE_RML_CLOSE_CHANNEL_COMPLETE(close_chan); + OBJ_RELEASE(close_chan); + } +} + +void orte_rml_base_send_close_channel ( orte_rml_close_channel_t *close_chan) +{ + opal_buffer_t *buffer; + // send msg to peer to close channel. + buffer = OBJ_NEW (opal_buffer_t); + /* pack the channel number*/ + opal_dss.pack(buffer, &close_chan->channel->peer_channel, 1, OPAL_UINT32); + orte_rml.send_buffer_nb( &close_chan->channel->peer, buffer, ORTE_RML_TAG_CLOSE_CHANNEL_REQ, + orte_rml_base_close_channel_send_callback, + close_chan); +} + +void orte_rml_base_close_channel_send_callback ( int status, + orte_process_name_t* sender, + opal_buffer_t* buffer, + orte_rml_tag_t tag, + void* cbdata) +{ + // this is the send call back for open channel request + orte_rml_close_channel_t *req = (orte_rml_close_channel_t*) cbdata; + orte_process_name_t peer = req->channel->peer; + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_close_channel_send_callback to peer %s status = %d", + ORTE_NAME_PRINT(sender), + ORTE_NAME_PRINT(&peer), status)); + req->status = status; + // if the message could not be sent log error + if (ORTE_SUCCESS != req->status) + ORTE_ERROR_LOG (req->status); + //complete the req. 
+ ORTE_RML_CLOSE_CHANNEL_COMPLETE(req); + opal_pointer_array_set_item ( &orte_rml_base.open_channels, req->channel->channel_num, NULL); + // release the channel object and the req. + OBJ_RELEASE(req->channel); + OBJ_RELEASE(req); + OBJ_RELEASE(buffer); +} + +void orte_rml_base_open_channel(int fd, short flags, void *cbdata) +{ + int32_t *type, type_val; + orte_rml_send_request_t *req = (orte_rml_send_request_t*)cbdata; + orte_process_name_t peer; + orte_rml_open_channel_t *open_chan; + orte_rml_channel_t *channel; + opal_buffer_t *buffer; + peer = req->post.open_channel.dst; + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_open_channel to peer %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&peer))); + OPAL_TIMING_EVENT((&tm_rml, "to %s", ORTE_NAME_PRINT(&peer))); + /* return error if a channel already exists */ + if ( NULL != (channel = get_channel (&peer, req->post.open_channel.qos_attributes, false))) + { + req->post.open_channel.status = ORTE_ERR_OPEN_CHANNEL_DUPLICATE; + req->post.open_channel.channel = channel; + ORTE_RML_OPEN_CHANNEL_COMPLETE(&req->post.open_channel); + OBJ_RELEASE(req); + return; + } + channel = OBJ_NEW(orte_rml_channel_t); + channel->channel_num = opal_pointer_array_add (&orte_rml_base.open_channels, channel); + channel->peer = peer; + open_chan = OBJ_NEW(orte_rml_open_channel_t); + open_chan->dst = peer; + open_chan->qos_attributes = req->post.open_channel.qos_attributes; + open_chan->cbfunc = req->post.open_channel.cbfunc; + open_chan->cbdata = req->post.open_channel.cbdata; + OBJ_RELEASE(req); + // associate open channel request and the newly created channel object + open_chan->channel = channel; + type = &type_val; + orte_get_attribute( open_chan->qos_attributes, ORTE_QOS_TYPE, (void**)&type, OPAL_UINT8); + open_chan->channel->qos = (void*) orte_qos_get_module (open_chan->qos_attributes); + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_open_channel type = %d to 
peer %s ", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + *type, + ORTE_NAME_PRINT(&peer))); + // now associate qos with the channel based on user requested attributes. + if ( NULL != open_chan->channel->qos) + { + open_chan->channel->qos_channel_ptr = orte_qos_create_channel (open_chan->channel->qos, + open_chan->qos_attributes, + open_chan->channel->channel_num); + // create rml send for open channel request. Call the corresponding QoS module to pack the attributes. + buffer = OBJ_NEW (opal_buffer_t); + // call QoS module to pack attributes + if ( ORTE_SUCCESS == (orte_qos_open_channel(open_chan->channel->qos, open_chan->channel->qos_channel_ptr, buffer))) + { + /* pack channel number at the end */ + opal_dss.pack(buffer, (void*) &open_chan->channel->channel_num, 1, OPAL_UINT32); + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_open_channel to peer %s SUCCESS sending to peer", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&peer))); + // post a recieve for open_channel_response tag + orte_rml.recv_buffer_nb(&peer, ORTE_RML_TAG_OPEN_CHANNEL_RESP, + ORTE_RML_NON_PERSISTENT, orte_rml_base_open_channel_resp_callback, open_chan); + // send request to peer to open channel + orte_rml.send_buffer_nb( &peer, buffer, ORTE_RML_TAG_OPEN_CHANNEL_REQ, + orte_rml_base_open_channel_send_callback, + open_chan); + + } else { + open_chan->status = ORTE_ERR_PACK_FAILURE; + ORTE_RML_OPEN_CHANNEL_COMPLETE(open_chan); + opal_pointer_array_set_item ( &orte_rml_base.open_channels, open_chan->channel->channel_num, NULL); + // call QoS module to release the QoS channel object. 
+ orte_qos_close_channel (open_chan->channel->qos, open_chan->channel->qos_channel_ptr); + OBJ_RELEASE (buffer); + OBJ_RELEASE(open_chan->channel); + OBJ_RELEASE(open_chan); + } + } + else + { + // do error completion because a component for the requested QoS does not exist + open_chan->status = ORTE_ERR_QOS_TYPE_UNSUPPORTED; + ORTE_RML_OPEN_CHANNEL_COMPLETE(open_chan); + opal_pointer_array_set_item ( &orte_rml_base.open_channels, open_chan->channel->channel_num, NULL); + OBJ_RELEASE(open_chan->channel); + OBJ_RELEASE(open_chan); + } + +} + +void orte_rml_base_open_channel_send_callback ( int status, + orte_process_name_t* sender, + opal_buffer_t* buffer, + orte_rml_tag_t tag, + void* cbdata) +{ + // this is the send call back for open channel request + orte_rml_open_channel_t *req = (orte_rml_open_channel_t*) cbdata; + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_open_channel_send_callback to peer %s status = %d", + ORTE_NAME_PRINT(sender), + ORTE_NAME_PRINT(&req->dst), status)); + // if the message was not sent we should retry or complete the request appropriately + if (status!= ORTE_SUCCESS) + { + req->status = status; + ORTE_RML_OPEN_CHANNEL_COMPLETE(req); + opal_pointer_array_set_item ( &orte_rml_base.open_channels, req->channel->channel_num, NULL); + // call QoS module to release the QoS channel object. 
+ orte_qos_close_channel (req->channel->qos, req->channel->qos_channel_ptr); + OBJ_RELEASE(req->channel); + OBJ_RELEASE(req); + } + else { + // start a timer for response from peer + } + OBJ_RELEASE(buffer); +} + +void orte_rml_base_open_channel_resp_callback (int status, + orte_process_name_t* peer, + struct opal_buffer_t* buffer, + orte_rml_tag_t tag, + void* cbdata) +{ + orte_rml_open_channel_t *req = (orte_rml_open_channel_t*) cbdata; + orte_rml_channel_t * channel = req->channel; + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_open_channel_resp_callback to peer %s status = %d channel = %p", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(peer), status, + (void*)channel)); + int32_t rc; + bool peer_resp = false; + int32_t count = 1; + // unpack peer response from buffer to determine if peer has accepted the open request + if ((ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &peer_resp, &count, OPAL_BOOL))) && peer_resp) { + + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_open_channel_resp_callback to peer response = %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + peer_resp)); + /* response will contain the peer channel number - the peer does not have the + option to change the channel attributes + unpack and get peer channel number.*/ + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &channel->peer_channel, &count, OPAL_INT))) { + ORTE_ERROR_LOG(rc); + req->status = ORTE_ERR_UNPACK_FAILURE; + opal_pointer_array_set_item ( &orte_rml_base.open_channels, req->channel->channel_num, NULL); + // call QoS module to release the QoS channel object. + orte_qos_close_channel (req->channel->qos, req->channel->qos_channel_ptr); + OBJ_RELEASE(req->channel); + // TBD : should we send a close channel to the peer?? + } + else { + // call qos module to update the channel state.?? 
+ req->status = ORTE_SUCCESS; + req->channel->state = orte_rml_channel_open; + } + } + else { + if (rc) { + ORTE_ERROR_LOG(rc); + req->status = ORTE_ERR_UNPACK_FAILURE; + } else { + req->status = ORTE_ERR_OPEN_CHANNEL_PEER_REJECT; + } + opal_pointer_array_set_item ( &orte_rml_base.open_channels, req->channel->channel_num, NULL); + // call QoS module to release the QoS channel object. + orte_qos_close_channel (req->channel->qos, req->channel->qos_channel_ptr); + OBJ_RELEASE(req->channel); + } + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_open_channel_resp_callback to peer %s status = %d channel =%p num = %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(peer), req->status, + (void*)channel, channel->channel_num)); + ORTE_RML_OPEN_CHANNEL_COMPLETE(req); + OBJ_RELEASE(req); +} + +static int unpack_channel_attributes (opal_buffer_t *buffer, + opal_list_t *qos_attributes) +{ + orte_attribute_t *kv; + int32_t count, n, k; + int32_t rc=ORTE_SUCCESS; + /* unpack the attributes */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &count, + &n, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_unpack_attributes num attributes = %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + count)); + for (k=0; k < count; k++) { + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &kv, + &n, ORTE_ATTRIBUTE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "rml_unpack_attributes unpacked attribute key = %d, value = %d ", + kv->key, + kv->data.uint8)); + kv->local = ORTE_ATTR_GLOBAL; + opal_list_append(qos_attributes, &kv->super); + } + return rc; +} + +void orte_rml_open_channel_recv_callback (int status, + orte_process_name_t* peer, + struct opal_buffer_t* buffer, + orte_rml_tag_t tag, + void* cbdata) +{ + opal_list_t qos_attributes; + orte_rml_channel_t *channel; + uint8_t *type, type_val 
= 10; + int32_t count =1; + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_open_channel_recv_callback from peer %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(peer))); + OBJ_CONSTRUCT(&qos_attributes, opal_list_t); + /* unpack attributes first */ + if ( ORTE_SUCCESS == unpack_channel_attributes( buffer, &qos_attributes)) { + type = &type_val; + orte_get_attribute( &qos_attributes, ORTE_QOS_TYPE, (void**)&type, OPAL_UINT8); + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "rml_open_channel_recv_callback type =%d", + type_val)); + /* scan the list of channels to see if we already have a channel with qos_attributes */ + if (NULL == (channel = get_channel ( peer, &qos_attributes, true))) { + /* create a new channel for the req */ + channel = OBJ_NEW(orte_rml_channel_t); + channel->channel_num = opal_pointer_array_add (&orte_rml_base.open_channels, channel); + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "rml_open_channel_recv_callback channel num =%d", + channel->channel_num)); + channel->peer = *peer; + channel->recv = true; + channel->qos = (void*) orte_qos_get_module (&qos_attributes); + /* now associate qos with the channel based on requested attributes */ + channel->qos_channel_ptr = (void*) orte_qos_create_channel(channel->qos, &qos_attributes, + channel->channel_num); + if (channel->qos_channel_ptr) { + /* call qos to init recv state */ + orte_qos_init_recv_channel ( channel->qos, channel->qos_channel_ptr, &qos_attributes); + /* send channel accept reply to sender */ + if(ORTE_SUCCESS == send_open_channel_reply (peer, channel, true)) { + /* update channel state */ + channel->state = orte_rml_channel_open; + /*store src channel number */ + opal_dss.unpack(buffer, (void*) &channel->peer_channel, &count, OPAL_UINT32); + } + else { + /* the receiver shall not attempt to resend or send a reject message + instead we let the sender's request timeout at his end. 
+ release the channel etc */ + opal_pointer_array_set_item ( &orte_rml_base.open_channels, channel->channel_num, NULL); + orte_qos_close_channel (channel->qos, channel->qos_channel_ptr); + OBJ_RELEASE(channel); + } + } else { + send_open_channel_reply (peer, NULL, false); + opal_pointer_array_set_item ( &orte_rml_base.open_channels, channel->channel_num, NULL); + //orte_qos_close_channel (channel->qos, channel->qos_channel_ptr); + OBJ_RELEASE(channel); + } + } + else { + /* there exists a channel with the same attributes reject the request */ + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "rml_open_channel_recv_callback OOPS CHANNEL EXISTS ALREADY channel num =%d", + channel->channel_num)); + send_open_channel_reply (peer, channel, false); + } + } + else { + //reply with error message + send_open_channel_reply (peer, NULL, false); + } +} + +static int send_open_channel_reply (orte_process_name_t *peer, + orte_rml_channel_t *channel, + bool accept) +{ + opal_buffer_t *buffer; + int32_t rc; + buffer = OBJ_NEW (opal_buffer_t); + if (OPAL_SUCCESS != (rc = opal_dss.pack(buffer, &accept , 1, OPAL_BOOL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (accept) { + if (OPAL_SUCCESS != (rc = opal_dss.pack(buffer, &channel->channel_num , 1, OPAL_INT))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + /* TBD: should specify reason for reject + send open channel response to sender */ + orte_rml.send_buffer_nb ( peer, buffer, ORTE_RML_TAG_OPEN_CHANNEL_RESP, + orte_rml_base_open_channel_reply_send_callback, + channel); + + return rc; +} + +static orte_rml_channel_t * get_channel ( orte_process_name_t * peer, + opal_list_t *qos_attributes, + bool recv) +{ + orte_rml_channel_t *channel = NULL; + int32_t i = 0; + /* search available channels and return channel that matches the attributes */ + for (i=0; i < orte_rml_base.open_channels.size; i++) { + if (NULL != (channel = (orte_rml_channel_t*) opal_pointer_array_get_item (&orte_rml_base.open_channels, i))) { + /* 
compare basic properties */ + if ((OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &channel->peer, peer)) && + ((orte_rml_channel_open == channel->state) || + (orte_rml_channel_opening == channel->state)) && + (channel->recv == recv)) + { + /* compare channel attributes */ + if( ORTE_SUCCESS == orte_qos_cmp_channel ( channel->qos, channel->qos_channel_ptr, qos_attributes)) + return channel; + + } + } + } + return NULL; +} + +void orte_rml_base_open_channel_reply_send_callback ( int status, + orte_process_name_t* sender, + opal_buffer_t* buffer, + orte_rml_tag_t tag, + void* cbdata) +{ + // this is the send call back for open channel reply + orte_rml_channel_t *channel = (orte_rml_channel_t*) cbdata; + // if the message was not sent we should retry or release the channel resources + if (status!= ORTE_SUCCESS) + { + ORTE_ERROR_LOG (status); + // release channel + if(NULL != channel) { + opal_pointer_array_set_item ( &orte_rml_base.open_channels, channel->channel_num, NULL); + // call QoS module to release the QoS channel object. 
+ orte_qos_close_channel (channel->qos, channel->qos_channel_ptr); + OBJ_RELEASE(channel); + } else { + // we did not accept the request so nothing to do + } + } + // if success then release the buffer and do open channel request completion after receiving response from peer + OBJ_RELEASE(buffer); +} + +orte_rml_channel_t * orte_rml_base_get_channel (orte_rml_channel_num_t chan_num) { + orte_rml_channel_t * channel; + + channel = (orte_rml_channel_t*) opal_pointer_array_get_item (&orte_rml_base.open_channels, chan_num); + if ((NULL != channel) && (orte_rml_channel_open == channel->state)) + return channel; + else + return NULL; + return channel; +} + +void orte_rml_base_prep_send_channel (orte_rml_channel_t *channel, + orte_rml_send_t *send) +{ + // add channel number and notify Qos + send->dst_channel = channel->peer_channel; + orte_qos_send_channel (channel->qos, channel->qos_channel_ptr, send); +} + +int orte_rml_base_process_recv_channel (orte_rml_channel_t *channel, + orte_rml_recv_t *recv) +{ + // call qos for recv post processing + return (orte_qos_recv_channel (channel->qos, channel->qos_channel_ptr, recv)); +} + +void orte_rml_close_channel_recv_callback (int status, + orte_process_name_t* peer, + struct opal_buffer_t* buffer, + orte_rml_tag_t tag, + void* cbdata) +{ + // find the channel and close it or log error + orte_rml_channel_t *channel; + int32_t count =1, rc; + orte_rml_channel_num_t channel_num =5; + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_close_channel_recv_callback from peer %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(peer))); + /* unpack channel number */ + if(ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &channel_num, + &count, OPAL_UINT32))) { + ORTE_ERROR_LOG(rc); + return; + } + channel = orte_rml_base_get_channel(channel_num); + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_close_channel_recv_callback for channel num =%d channel=%p", + 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + channel_num, (void*)channel)); + if (NULL != channel) { + orte_qos_close_channel ( channel->qos, channel->qos_channel_ptr); + opal_pointer_array_set_item ( &orte_rml_base.open_channels, channel->channel_num, NULL); + OBJ_RELEASE(channel); + } else { + ORTE_ERROR_LOG(OPAL_ERR_BAD_PARAM); + } +} diff --git a/orte/mca/rml/base/rml_base_frame.c b/orte/mca/rml/base/rml_base_frame.c index e4d14b0fc3d..0d763a3adc8 100644 --- a/orte/mca/rml/base/rml_base_frame.c +++ b/orte/mca/rml/base/rml_base_frame.c @@ -3,13 +3,13 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights - * reserved. + * reserved. * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 -2015 Intel Corporation. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -48,7 +48,7 @@ static int orte_rml_base_register(mca_base_register_flag_t flags) { int var_id; - /* + /* * Which RML Wrapper component to use, if any * - NULL or "" = No wrapper * - ow. 
select that specific wrapper component @@ -84,6 +84,7 @@ static int orte_rml_base_close(void) OBJ_DESTRUCT(&orte_rml_base.posted_recvs); OPAL_TIMING_REPORT(orte_rml_base.timing, &tm_rml); + OBJ_DESTRUCT(&orte_rml_base.open_channels); return mca_base_framework_components_close(&orte_rml_base_framework, NULL); } @@ -93,6 +94,11 @@ static int orte_rml_base_open(mca_base_open_flag_t flags) /* Initialize globals */ OBJ_CONSTRUCT(&orte_rml_base.posted_recvs, opal_list_t); OBJ_CONSTRUCT(&orte_rml_base.unmatched_msgs, opal_list_t); + OBJ_CONSTRUCT(&orte_rml_base.open_channels, opal_pointer_array_t); + if (OPAL_SUCCESS != opal_pointer_array_init(&orte_rml_base.open_channels, 0, + INT_MAX, 1)) { + return ORTE_ERR_OUT_OF_RESOURCE; + } OPAL_TIMING_INIT(&tm_rml); /* Open up all available components */ return mca_base_framework_components_open(&orte_rml_base_framework, flags); @@ -124,13 +130,13 @@ int orte_rml_base_select(void) orte_rml_component_t* component; component = (orte_rml_component_t *) cli->cli_component; - opal_output_verbose(10, orte_rml_base_framework.framework_output, + opal_output_verbose(10, orte_rml_base_framework.framework_output, "orte_rml_base_select: initializing %s component %s", component->rml_version.mca_type_name, component->rml_version.mca_component_name); if (NULL == component->rml_init) { - opal_output_verbose(10, orte_rml_base_framework.framework_output, + opal_output_verbose(10, orte_rml_base_framework.framework_output, "orte_rml_base_select: no init function; ignoring component"); } else { int priority = 0; @@ -148,7 +154,7 @@ int orte_rml_base_select(void) if(NULL != orte_rml_base_wrapper && /* If this is a wrapper component then save it for later */ RML_SELECT_WRAPPER_PRIORITY >= priority) { - if( 0 == strncmp(component->rml_version.mca_component_name, + if( 0 == strncmp(component->rml_version.mca_component_name, orte_rml_base_wrapper, strlen(orte_rml_base_wrapper) ) ) { wrapper_component = component; @@ -158,7 +164,6 @@ int 
orte_rml_base_select(void) if (NULL != selected_module && NULL != selected_module->finalize) { selected_module->finalize(); } - selected_priority = priority; selected_component = component; selected_module = module; @@ -166,7 +171,7 @@ int orte_rml_base_select(void) } } - /* + /* * Unload all components that were not selected */ OPAL_LIST_FOREACH_SAFE(item, next, &orte_rml_base_framework.framework_components, opal_list_item_t) { @@ -192,7 +197,7 @@ int orte_rml_base_select(void) orte_rml_component = selected_component; } - /* If a wrapper component was requested then + /* If a wrapper component was requested then * Make sure it can switch out the selected module */ if( NULL != wrapper_component) { @@ -205,7 +210,14 @@ int orte_rml_base_select(void) } return ORTE_ERROR; } - + /* Post a persistent receive for open channel requests from any peer */ + orte_rml.recv_buffer_nb (ORTE_NAME_WILDCARD, ORTE_RML_TAG_OPEN_CHANNEL_REQ, + ORTE_RML_PERSISTENT, orte_rml_open_channel_recv_callback, + NULL); + /* Post a persistent receive for close channel requests from any peer */ + orte_rml.recv_buffer_nb (ORTE_NAME_WILDCARD, ORTE_RML_TAG_CLOSE_CHANNEL_REQ, + ORTE_RML_PERSISTENT, orte_rml_close_channel_recv_callback, + NULL); return ORTE_SUCCESS; } @@ -235,6 +247,7 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender, blob->active = false; } + /*** RML CLASS INSTANCES ***/ static void send_cons(orte_rml_send_t *ptr) { @@ -242,14 +255,48 @@ static void send_cons(orte_rml_send_t *ptr) ptr->iov = NULL; ptr->buffer = NULL; ptr->data = NULL; + ptr->channel = NULL; + ptr->dst_channel = ORTE_RML_INVALID_CHANNEL_NUM; + ptr->seq_num = 0xFFFFFFFF; } OBJ_CLASS_INSTANCE(orte_rml_send_t, opal_list_item_t, send_cons, NULL); +static void channel_cons(orte_rml_channel_t *ptr) +{ /* Constructor: marks the channel number invalid, clears the QoS module and QoS channel pointers and the recv flag. NOTE(review): state, peer and peer_channel are read elsewhere in this patch but are not initialized here - confirm they are always set before first use. */ + ptr->channel_num = ORTE_RML_INVALID_CHANNEL_NUM; + ptr->qos = NULL; + ptr->qos_channel_ptr = NULL; + ptr->recv = false; +} + +OBJ_CLASS_INSTANCE(orte_rml_channel_t, + opal_list_item_t, + channel_cons, NULL); + +static void 
open_channel_cons(orte_rml_open_channel_t *ptr) +{ /* Constructor: clears the callback data and QoS attributes pointers. */ + ptr->cbdata = NULL; + ptr->qos_attributes = NULL; +} +OBJ_CLASS_INSTANCE(orte_rml_open_channel_t, + opal_list_item_t, + open_channel_cons, NULL); + +static void close_channel_cons(orte_rml_close_channel_t *ptr) +{ /* Constructor: clears the callback data and channel pointers. */ + ptr->cbdata = NULL; + ptr->channel = NULL; +} +OBJ_CLASS_INSTANCE(orte_rml_close_channel_t, + opal_list_item_t, + close_channel_cons, NULL); + static void send_req_cons(orte_rml_send_request_t *ptr) { - OBJ_CONSTRUCT(&ptr->post, orte_rml_send_t); + OBJ_CONSTRUCT(&ptr->post.send, orte_rml_send_t); + OBJ_CONSTRUCT(&ptr->post.open_channel, orte_rml_open_channel_t); /* NOTE(review): post.close_channel is also filled in by orte_rml_oob_close_channel but is not constructed here; if post is a union the second construct may overwrite the first - confirm */ } OBJ_CLASS_INSTANCE(orte_rml_send_request_t, opal_object_t, @@ -259,6 +306,7 @@ static void recv_cons(orte_rml_recv_t *ptr) { ptr->iov.iov_base = NULL; ptr->iov.iov_len = 0; + ptr->channel_num = ORTE_RML_INVALID_CHANNEL_NUM; } static void recv_des(orte_rml_recv_t *ptr) { @@ -304,4 +352,3 @@ static void prq_des(orte_rml_recv_request_t *ptr) OBJ_CLASS_INSTANCE(orte_rml_recv_request_t, opal_object_t, prq_cons, prq_des); - diff --git a/orte/mca/rml/base/rml_base_msg_handlers.c b/orte/mca/rml/base/rml_base_msg_handlers.c index 9e01ae38fd0..f35e4922c83 100644 --- a/orte/mca/rml/base/rml_base_msg_handlers.c +++ b/orte/mca/rml/base/rml_base_msg_handlers.c @@ -11,7 +11,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights - * reserved. + * reserved. + * Copyright (c) 2015 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -47,9 +48,12 @@ #include "orte/mca/rml/rml.h" #include "orte/mca/rml/base/base.h" #include "orte/mca/rml/base/rml_contact.h" +#include "orte/mca/qos/base/base.h" + static void msg_match_recv(orte_rml_posted_recv_t *rcv, bool get_all); + void orte_rml_base_post_recv(int sd, short args, void *cbdata) { orte_rml_recv_request_t *req = (orte_rml_recv_request_t*)cbdata; @@ -116,6 +120,76 @@ void orte_rml_base_post_recv(int sd, short args, void *cbdata) OBJ_RELEASE(req); } +void orte_rml_base_complete_recv_msg (orte_rml_recv_t **recv_msg) +{ /* Matches *recv_msg against orte_rml_base.posted_recvs by sender (wildcard-aware) and tag. On the first match the payload is handed to the posted callback, the message is released and a non-persistent recv is removed; with no match the message is appended to orte_rml_base.unmatched_msgs. NOTE(review): *recv_msg is never reset after the release, so the caller's pointer is stale on the matched path. */ + orte_rml_posted_recv_t *post; + orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL | ORTE_NS_CMP_WILD; + opal_buffer_t buf; + orte_rml_recv_t *msg = *recv_msg; + /* see if we have a waiting recv for this message */ + OPAL_LIST_FOREACH(post, &orte_rml_base.posted_recvs, orte_rml_posted_recv_t) { + /* since names could include wildcards, must use + * the more generalized comparison function + */ + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &msg->sender, &post->peer) && + msg->tag == post->tag) { + /* deliver the data to this location */ + if (post->buffer_data) { + /* deliver it in a buffer */ + OBJ_CONSTRUCT(&buf, opal_buffer_t); + opal_dss.load(&buf, msg->iov.iov_base, msg->iov.iov_len); + /* xfer ownership of the malloc'd data to the buffer */ + msg->iov.iov_base = NULL; + post->cbfunc.buffer(ORTE_SUCCESS, &msg->sender, &buf, msg->tag, post->cbdata); + /* the user must have unloaded the buffer if they wanted + * to retain ownership of it, so release whatever remains + */ + OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output, + "%s message received bytes from %s for tag %d on channel=%d called callback", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&msg->sender), + msg->tag, + msg->channel_num)); + OBJ_DESTRUCT(&buf); + } else { + /* deliver as an iovec */ + post->cbfunc.iov(ORTE_SUCCESS, &msg->sender, &msg->iov, 1, msg->tag, post->cbdata); + /* the 
user should have shifted the data to + * a local variable and NULL'd the iov_base + * if they wanted ownership of the data + */ + } + /* release the message */ + OBJ_RELEASE(msg); + OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output, + "%s message tag %d on released", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + post->tag)); + /* if the recv is non-persistent, remove it */ + if (!post->persistent) { + opal_list_remove_item(&orte_rml_base.posted_recvs, &post->super); + /*OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output, + "%s non persistent recv %p remove success releasing now", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + post));*/ + OBJ_RELEASE(post); + + } + return; + } + } + /* we get here if no matching recv was found - we then hold + * the message until such a recv is issued + */ + OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output, + "%s message received bytes from %s for tag %d on channel=%d Not Matched adding to unmatched msgs", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&msg->sender), + msg->tag, + msg->channel_num)); + opal_list_append(&orte_rml_base.unmatched_msgs, &msg->super); +} + static void msg_match_recv(orte_rml_posted_recv_t *rcv, bool get_all) { opal_list_item_t *item, *next; @@ -141,7 +215,7 @@ static void msg_match_recv(orte_rml_posted_recv_t *rcv, bool get_all) */ if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &msg->sender, &rcv->peer) && msg->tag == rcv->tag) { - ORTE_RML_ACTIVATE_MESSAGE(msg); + ORTE_RML_REACTIVATE_MESSAGE(msg); /* NOTE(review): presumably schedules orte_rml_base_reprocess_msg so the QoS step is not run twice - confirm against the macro definition */ opal_list_remove_item(&orte_rml_base.unmatched_msgs, item); if (!get_all) { break; @@ -154,60 +228,46 @@ static void msg_match_recv(orte_rml_posted_recv_t *rcv, bool get_all) void orte_rml_base_process_msg(int fd, short flags, void *cbdata) { orte_rml_recv_t *msg = (orte_rml_recv_t*)cbdata; - orte_rml_posted_recv_t *post; - orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL | ORTE_NS_CMP_WILD; - opal_buffer_t buf; - OPAL_OUTPUT_VERBOSE((5, 
orte_rml_base_framework.framework_output, - "%s message received %d bytes from %s for tag %d", + "%s message received from %s for tag %d on channel=%d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (int)msg->iov.iov_len, ORTE_NAME_PRINT(&msg->sender), - msg->tag)); + msg->tag, + msg->channel_num)); OPAL_TIMING_EVENT((&tm_rml,"from %s %d bytes", ORTE_NAME_PRINT(&msg->sender), msg->iov.iov_len)); + if ((ORTE_RML_INVALID_CHANNEL_NUM != msg->channel_num) && + (NULL != orte_rml_base_get_channel(msg->channel_num) )) { /* only messages that carry a channel number resolving to an open channel go through the QoS step */ - /* see if we have a waiting recv for this message */ - OPAL_LIST_FOREACH(post, &orte_rml_base.posted_recvs, orte_rml_posted_recv_t) { - /* since names could include wildcards, must use - * the more generalized comparison function - */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &msg->sender, &post->peer) && - msg->tag == post->tag) { - /* deliver the data to this location */ - if (post->buffer_data) { - /* deliver it in a buffer */ - OBJ_CONSTRUCT(&buf, opal_buffer_t); - opal_dss.load(&buf, msg->iov.iov_base, msg->iov.iov_len); - /* xfer ownership of the malloc'd data to the buffer */ - msg->iov.iov_base = NULL; - post->cbfunc.buffer(ORTE_SUCCESS, &msg->sender, &buf, msg->tag, post->cbdata); - /* the user must have unloaded the buffer if they wanted - * to retain ownership of it, so release whatever remains - */ - OBJ_DESTRUCT(&buf); - } else { - /* deliver as an iovec */ - post->cbfunc.iov(ORTE_SUCCESS, &msg->sender, &msg->iov, 1, msg->tag, post->cbdata); - /* the user should have shifted the data to - * a local variable and NULL'd the iov_base - * if they wanted ownership of the data - */ - } - /* release the message */ - OBJ_RELEASE(msg); - /* if the recv is non-persistent, remove it */ - if (!post->persistent) { - opal_list_remove_item(&orte_rml_base.posted_recvs, &post->super); - OBJ_RELEASE(post); - } + // call channel for recv post processing + if (ORTE_SUCCESS != (orte_rml_base_process_recv_channel 
(orte_rml_base_get_channel(msg->channel_num), msg))) + { + /* the qos channel has determined an error so we cannot complete this msg to the caller. NOTE(review): msg is neither released nor queued on this path - confirm the QoS layer keeps ownership of it */ + OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output, + "%s QoS channel receive error - cannot complete msg on channel=%d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + msg->channel_num)); return; } + } + orte_rml_base_complete_recv_msg (&msg); +} - /* we get here if no matching recv was found - we then hold - * the message until such a recv is issued - */ - opal_list_append(&orte_rml_base.unmatched_msgs, &msg->super); +void orte_rml_base_reprocess_msg(int fd, short flags, void *cbdata) +{ /* Same as orte_rml_base_process_msg minus the QoS channel step: traces the message and goes straight to orte_rml_base_complete_recv_msg. fd and flags are not used. */ + orte_rml_recv_t *msg = (orte_rml_recv_t*)cbdata; + OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output, + "%s reprocessing msg received from %s for tag %d on channel=%d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&msg->sender), + msg->tag, + msg->channel_num)); + + OPAL_TIMING_EVENT((&tm_rml,"from %s %d bytes", + ORTE_NAME_PRINT(&msg->sender), msg->iov.iov_len)); + orte_rml_base_complete_recv_msg ( &msg); + /* the msg should be matched and released in this path + add an assert (msg!= NULL) ?? */ } diff --git a/orte/mca/rml/base/rml_base_receive.c b/orte/mca/rml/base/rml_base_receive.c index bc77fbe3de9..3fbc2516c72 100644 --- a/orte/mca/rml/base/rml_base_receive.c +++ b/orte/mca/rml/base/rml_base_receive.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights - * reserved. + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -55,7 +55,6 @@ void orte_rml_base_comm_start(void) if (recv_issued) { return; } - orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_RML_INFO_UPDATE, ORTE_RML_PERSISTENT, @@ -70,7 +69,6 @@ void orte_rml_base_comm_stop(void) if (!recv_issued) { return; } - orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_RML_INFO_UPDATE); recv_issued = false; } @@ -88,19 +86,17 @@ orte_rml_base_recv(int status, orte_process_name_t* sender, orte_std_cntr_t count; opal_buffer_t *buf; int rc; - OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output, "%s rml:base:recv: processing message from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); - count = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_RML_CMD))) { ORTE_ERROR_LOG(rc); return; } - + switch (command) { case ORTE_RML_UPDATE_CMD: if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(buffer))) { @@ -108,11 +104,9 @@ orte_rml_base_recv(int status, orte_process_name_t* sender, return; } break; - default: ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); } - /* send an ack back - this is REQUIRED to ensure that the routing * info gets updated -before- a message intending to use that info * arrives. 
Because message ordering is NOT preserved in the OOB, it @@ -124,7 +118,6 @@ orte_rml_base_recv(int status, orte_process_name_t* sender, "%s rml:base:recv: sending ack to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); - buf = OBJ_NEW(opal_buffer_t); if (0 > (rc = orte_rml.send_buffer_nb(sender, buf, ORTE_RML_TAG_UPDATE_ROUTE_ACK, orte_rml_send_callback, NULL))) { diff --git a/orte/mca/rml/ftrm/rml_ftrm_component.c b/orte/mca/rml/ftrm/rml_ftrm_component.c index 9b0d86795f5..17ea19cf879 100644 --- a/orte/mca/rml/ftrm/rml_ftrm_component.c +++ b/orte/mca/rml/ftrm/rml_ftrm_component.c @@ -6,16 +6,16 @@ * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -80,8 +80,7 @@ orte_rml_module_t orte_rml_ftrm_module = { orte_rml_ftrm_del_exception_handler, orte_rml_ftrm_ft_event, - - orte_rml_ftrm_purge + orte_rml_ftrm_purge, }; int rml_ftrm_output_handle; diff --git a/orte/mca/rml/oob/rml_oob.h b/orte/mca/rml/oob/rml_oob.h index 7a8fdf1f09b..6c7741ac39a 100644 --- a/orte/mca/rml/oob/rml_oob.h +++ b/orte/mca/rml/oob/rml_oob.h @@ -5,7 +5,7 @@ * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. 
* All rights reserved. @@ -14,10 +14,11 @@ * All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2014 -2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -75,10 +76,32 @@ void orte_rml_oob_recv_buffer_nb(orte_process_name_t* peer, orte_rml_buffer_callback_fn_t cbfunc, void* cbdata); -void orte_rml_oob_recv_cancel(orte_process_name_t* peer, +void orte_rml_oob_recv_cancel(orte_process_name_t* peer, orte_rml_tag_t tag); -int orte_rml_oob_ping(const char* uri, +int orte_rml_oob_open_channel(orte_process_name_t * peer, + opal_list_t * qos_attributes, + orte_rml_channel_callback_fn_t cbfunc, + void *cbdata); + +int orte_rml_oob_send_channel_nb (orte_rml_channel_num_t channel, + struct iovec* msg, + int count, + orte_rml_tag_t tag, + orte_rml_send_channel_callback_fn_t cbfunc, + void* cbdata); + +int orte_rml_oob_send_buffer_channel_nb (orte_rml_channel_num_t channel, + opal_buffer_t *buffer, + orte_rml_tag_t tag, + orte_rml_send_buffer_channel_callback_fn_t cbfunc, + void* cbdata); + +int orte_rml_oob_close_channel (orte_rml_channel_num_t channel, + orte_rml_channel_callback_fn_t cbfunc, + void* cbdata); + +int orte_rml_oob_ping(const char* uri, const struct timeval* tv); char* orte_rml_oob_get_uri(void); diff --git a/orte/mca/rml/oob/rml_oob_component.c b/orte/mca/rml/oob/rml_oob_component.c index 8ec17ccaaaa..e7ba6e0516f 100644 --- a/orte/mca/rml/oob/rml_oob_component.c +++ b/orte/mca/rml/oob/rml_oob_component.c @@ -6,17 +6,18 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. 
* Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2015 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -97,10 +98,13 @@ orte_rml_oob_module_t orte_rml_oob_module = { orte_rml_oob_add_exception, orte_rml_oob_del_exception, - orte_rml_oob_ft_event, - - orte_rml_oob_purge + orte_rml_oob_purge, + + orte_rml_oob_open_channel, + orte_rml_oob_send_channel_nb, + orte_rml_oob_send_buffer_channel_nb, + orte_rml_oob_close_channel } }; @@ -127,11 +131,11 @@ rml_oob_init(int* priority) *priority = 1; return &orte_rml_oob_module.super; } - + *priority = 1; - + OBJ_CONSTRUCT(&orte_rml_oob_module.exceptions, opal_list_t); - + init_done = true; return &orte_rml_oob_module.super; } @@ -141,7 +145,7 @@ orte_rml_oob_init(void) { /* enable the base receive to get updates on contact info */ orte_rml_base_comm_start(); - + return ORTE_SUCCESS; } @@ -151,7 +155,7 @@ orte_rml_oob_fini(void) { opal_list_item_t *item; - while (NULL != + while (NULL != (item = opal_list_remove_first(&orte_rml_oob_module.exceptions))) { OBJ_RELEASE(item); } @@ -159,7 +163,7 @@ orte_rml_oob_fini(void) /* clear the base receive */ orte_rml_base_comm_stop(); - + return ORTE_SUCCESS; } diff --git a/orte/mca/rml/oob/rml_oob_send.c b/orte/mca/rml/oob/rml_oob_send.c index 3331856b708..92ebe891666 100644 --- a/orte/mca/rml/oob/rml_oob_send.c +++ b/orte/mca/rml/oob/rml_oob_send.c @@ -6,17 +6,17 @@ * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. 
* Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -34,7 +34,7 @@ #include "orte/mca/rml/base/base.h" #include "orte/mca/rml/rml_types.h" #include "rml_oob.h" - +#include "orte/mca/qos/base/base.h" typedef struct { opal_object_t object; opal_event_t ev; @@ -74,13 +74,13 @@ static void send_self_exe(int fd, short args, void* data) if (NULL != xfer->iov) { if (NULL != xfer->cbfunc.iov) { /* non-blocking iovec send */ - xfer->cbfunc.iov(ORTE_SUCCESS, ORTE_PROC_MY_NAME, xfer->iov, xfer->count, + xfer->cbfunc.iov(ORTE_SUCCESS, ORTE_PROC_MY_NAME, xfer->iov, xfer->count, xfer->tag, xfer->cbdata); } } else if (NULL != xfer->buffer) { if (NULL != xfer->cbfunc.buffer) { /* non-blocking buffer send */ - xfer->cbfunc.buffer(ORTE_SUCCESS, ORTE_PROC_MY_NAME, xfer->buffer, + xfer->cbfunc.buffer(ORTE_SUCCESS, ORTE_PROC_MY_NAME, xfer->buffer, xfer->tag, xfer->cbdata); } } else { @@ -95,8 +95,8 @@ static void send_self_exe(int fd, short args, void* data) static void send_msg(int fd, short args, void *cbdata) { orte_rml_send_request_t *req = (orte_rml_send_request_t*)cbdata; - orte_process_name_t *peer = &(req->post.dst); - orte_rml_tag_t tag = req->post.tag; + orte_process_name_t *peer = &(req->post.send.dst); + orte_rml_tag_t tag = req->post.send.tag; orte_rml_recv_t *rcv; orte_rml_send_t *snd; int bytes; @@ -135,16 +135,16 @@ static void send_msg(int fd, short args, void *cbdata) /* setup the send callback */ xfer = OBJ_NEW(orte_self_send_xfer_t); - if (NULL != req->post.iov) { - xfer->iov = req->post.iov; - xfer->count = req->post.count; - xfer->cbfunc.iov = req->post.cbfunc.iov; + if (NULL != req->post.send.iov) { + xfer->iov = req->post.send.iov; + 
xfer->count = req->post.send.count; + xfer->cbfunc.iov = req->post.send.cbfunc.iov; } else { - xfer->buffer = req->post.buffer; - xfer->cbfunc.buffer = req->post.cbfunc.buffer; + xfer->buffer = req->post.send.buffer; + xfer->cbfunc.buffer = req->post.send.cbfunc.buffer; } xfer->tag = tag; - xfer->cbdata = req->post.cbdata; + xfer->cbdata = req->post.send.cbdata; /* setup the event for the send callback */ opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer); opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI); @@ -154,11 +154,11 @@ static void send_msg(int fd, short args, void *cbdata) rcv = OBJ_NEW(orte_rml_recv_t); rcv->sender = *peer; rcv->tag = tag; - if (NULL != req->post.iov) { + if (NULL != req->post.send.iov) { /* get the total number of bytes in the iovec array */ bytes = 0; - for (i = 0 ; i < req->post.count ; ++i) { - bytes += req->post.iov[i].iov_len; + for (i = 0 ; i < req->post.send.count ; ++i) { + bytes += req->post.send.iov[i].iov_len; } /* get the required memory allocation */ if (0 < bytes) { @@ -166,15 +166,15 @@ static void send_msg(int fd, short args, void *cbdata) rcv->iov.iov_len = bytes; /* transfer the bytes */ ptr = (char*)rcv->iov.iov_base; - for (i = 0 ; i < req->post.count ; ++i) { - memcpy(ptr, req->post.iov[i].iov_base, req->post.iov[i].iov_len); - ptr += req->post.iov[i].iov_len; + for (i = 0 ; i < req->post.send.count ; ++i) { + memcpy(ptr, req->post.send.iov[i].iov_base, req->post.send.iov[i].iov_len); + ptr += req->post.send.iov[i].iov_len; } } - } else if (0 < req->post.buffer->bytes_used) { - rcv->iov.iov_base = (IOVBASE_TYPE*)malloc(req->post.buffer->bytes_used); - memcpy(rcv->iov.iov_base, req->post.buffer->base_ptr, req->post.buffer->bytes_used); - rcv->iov.iov_len = req->post.buffer->bytes_used; + } else if (0 < req->post.send.buffer->bytes_used) { + rcv->iov.iov_base = (IOVBASE_TYPE*)malloc(req->post.send.buffer->bytes_used); + memcpy(rcv->iov.iov_base, req->post.send.buffer->base_ptr, 
req->post.send.buffer->bytes_used); + rcv->iov.iov_len = req->post.send.buffer->bytes_used; } /* post the message for receipt - since the send callback was posted * first and has the same priority, it will execute first @@ -188,16 +188,24 @@ static void send_msg(int fd, short args, void *cbdata) snd->dst = *peer; snd->origin = *ORTE_PROC_MY_NAME; snd->tag = tag; - if (NULL != req->post.iov) { - snd->iov = req->post.iov; - snd->count = req->post.count; - snd->cbfunc.iov = req->post.cbfunc.iov; + if (NULL != req->post.send.iov) { + snd->iov = req->post.send.iov; + snd->count = req->post.send.count; + snd->cbfunc.iov = req->post.send.cbfunc.iov; } else { - snd->buffer = req->post.buffer; - snd->cbfunc.buffer = req->post.cbfunc.buffer; + snd->buffer = req->post.send.buffer; + snd->cbfunc.buffer = req->post.send.cbfunc.buffer; + } + snd->cbdata = req->post.send.cbdata; + snd->channel = req->post.send.channel; + /* call send prep to prep the Qos channel for send */ + if (NULL != snd->channel) + { + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s send_msg sending on channel %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), snd->channel->channel_num)); + orte_rml_base_prep_send_channel (snd->channel, snd); } - snd->cbdata = req->post.cbdata; - /* activate the OOB send state */ ORTE_OOB_SEND(snd); @@ -205,6 +213,7 @@ static void send_msg(int fd, short args, void *cbdata) } + int orte_rml_oob_send_nb(orte_process_name_t* peer, struct iovec* iov, int count, @@ -224,24 +233,22 @@ int orte_rml_oob_send_nb(orte_process_name_t* peer, ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } - if( NULL == peer || - OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) { + OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) { /* cannot send to an invalid peer */ ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } - /* get ourselves into an event to protect against * race 
conditions and threads */ req = OBJ_NEW(orte_rml_send_request_t); - req->post.dst = *peer; - req->post.iov = iov; - req->post.count = count; - req->post.tag = tag; - req->post.cbfunc.iov = cbfunc; - req->post.cbdata = cbdata; + req->post.send.dst = *peer; + req->post.send.iov = iov; + req->post.send.count = count; + req->post.send.tag = tag; + req->post.send.cbfunc.iov = cbfunc; + req->post.send.cbdata = cbdata; /* setup the event for the send callback */ opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req); opal_event_set_priority(&req->ev, ORTE_MSG_PRI); @@ -269,23 +276,21 @@ int orte_rml_oob_send_buffer_nb(orte_process_name_t* peer, ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } - - if (NULL == peer || - OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) { + if( NULL == peer || + OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) { /* cannot send to an invalid peer */ ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } - /* get ourselves into an event to protect against * race conditions and threads */ req = OBJ_NEW(orte_rml_send_request_t); - req->post.dst = *peer; - req->post.buffer = buffer; - req->post.tag = tag; - req->post.cbfunc.buffer = cbfunc; - req->post.cbdata = cbdata; + req->post.send.dst = *peer; + req->post.send.buffer = buffer; + req->post.send.tag = tag; + req->post.send.cbfunc.buffer = cbfunc; + req->post.send.cbdata = cbdata; /* setup the event for the send callback */ opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req); opal_event_set_priority(&req->ev, ORTE_MSG_PRI); @@ -293,3 +298,145 @@ int orte_rml_oob_send_buffer_nb(orte_process_name_t* peer, return ORTE_SUCCESS; } + +int orte_rml_oob_open_channel(orte_process_name_t * peer, + opal_list_t *qos_attributes, + orte_rml_channel_callback_fn_t cbfunc, + void *cbdata) +{ /* Validates peer, then queues a request on orte_event_base so that orte_rml_base_open_channel runs from the event loop. Returns ORTE_SUCCESS once the event is activated, ORTE_ERR_BAD_PARAM for a NULL or invalid peer. The qos_attributes pointer is stored, not copied. */ + orte_rml_send_request_t *req; + OPAL_OUTPUT_VERBOSE((1, 
orte_rml_base_framework.framework_output, + "%s rml_open_channel to peer %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(peer))); /* NOTE(review): peer is printed before the NULL check below - confirm ORTE_NAME_PRINT tolerates NULL */ + if( NULL == peer || + OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) { + /* cannot send to an invalid peer */ + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + /* process the request in an event to be safe */ + req = OBJ_NEW(orte_rml_send_request_t); + req->post.open_channel.dst = *peer; + req->post.open_channel.qos_attributes = qos_attributes; + req->post.open_channel.cbfunc = cbfunc; + req->post.open_channel.cbdata = cbdata; + /* setup the event for the open callback */ + opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, orte_rml_base_open_channel, req); + opal_event_set_priority(&req->ev, ORTE_MSG_PRI); + opal_event_active(&req->ev, OPAL_EV_WRITE, 1); + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_open_channel to peer %s - set event done", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(peer))); + return ORTE_SUCCESS; +} + +int orte_rml_oob_send_channel_nb (orte_rml_channel_num_t channel_num, + struct iovec* msg, + int count, + orte_rml_tag_t tag, + orte_rml_send_channel_callback_fn_t cbfunc, + void* cbdata) +{ /* Non-blocking iovec send on an open channel: rejects an invalid tag or a channel number that does not resolve to an open channel (ORTE_ERR_BAD_PARAM), otherwise queues send_msg on orte_event_base with the channel's peer as destination and returns ORTE_SUCCESS. */ + orte_rml_send_request_t *req; + orte_rml_channel_t *channel; + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_send_buffer to channel %d at tag %d", /* NOTE(review): trace text says send_buffer although this is the iovec variant */ + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + channel_num, tag)); + + if (ORTE_RML_TAG_INVALID == tag) { + /* cannot send to an invalid tag */ + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + channel = (orte_rml_channel_t*) orte_rml_base_get_channel (channel_num); + if (NULL == channel) { + /* cannot send to a non existing or closed channel */ + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + /* get ourselves into an event to protect against + * race conditions and threads + */ + req = 
OBJ_NEW(orte_rml_send_request_t); + req->post.send.dst = channel->peer; + req->post.send.iov = msg; + req->post.send.count = count; + req->post.send.tag = tag; + req->post.send.cbfunc.iov_chan = cbfunc; + req->post.send.cbdata = cbdata; + req->post.send.channel = channel; + /* setup the event for the send callback */ + opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req); + opal_event_set_priority(&req->ev, ORTE_MSG_PRI); + opal_event_active(&req->ev, OPAL_EV_WRITE, 1); + return ORTE_SUCCESS; +} + +int orte_rml_oob_send_buffer_channel_nb (orte_rml_channel_num_t channel_num, + opal_buffer_t *buffer, + orte_rml_tag_t tag, + orte_rml_send_buffer_channel_callback_fn_t cbfunc, + void* cbdata) +{ /* Buffer counterpart of orte_rml_oob_send_channel_nb: same tag and channel validation, then queues send_msg on orte_event_base with the buffer as payload and returns ORTE_SUCCESS. */ + orte_rml_send_request_t *req; + orte_rml_channel_t *channel; + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_send_buffer to channel %d at tag %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + channel_num, tag)); + + if (ORTE_RML_TAG_INVALID == tag) { + /* cannot send to an invalid tag */ + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + channel = (orte_rml_channel_t*) orte_rml_base_get_channel (channel_num); + if (NULL == channel) { + /* cannot send to a non existing or closed channel */ + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + /* get ourselves into an event to protect against + * race conditions and threads + */ + req = OBJ_NEW(orte_rml_send_request_t); + req->post.send.dst = channel->peer; + req->post.send.buffer = buffer; + req->post.send.tag = tag; + req->post.send.cbfunc.buf_chan = cbfunc; + req->post.send.cbdata = cbdata; + req->post.send.channel = channel; + /* setup the event for the send callback */ + opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req); + opal_event_set_priority(&req->ev, ORTE_MSG_PRI); + opal_event_active(&req->ev, OPAL_EV_WRITE, 1); + return ORTE_SUCCESS; +} + +int orte_rml_oob_close_channel (orte_rml_channel_num_t 
channel_num, + orte_rml_channel_callback_fn_t cbfunc, + void* cbdata) +{ /* Resolves channel_num to an open channel and queues orte_rml_base_close_channel on orte_event_base; returns ORTE_ERR_BAD_PARAM when no open channel matches, ORTE_SUCCESS once the event is activated. */ + orte_rml_channel_t *channel; + orte_rml_send_request_t *req; + OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output, + "%s rml_close_channel channel num %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + channel_num)); + channel = orte_rml_base_get_channel (channel_num); + if (NULL == channel) + return ORTE_ERR_BAD_PARAM; /* NOTE(review): unlike the send paths above, this failure is not reported through ORTE_ERROR_LOG */ + /* process the request in an event to be safe */ + req = OBJ_NEW(orte_rml_send_request_t); + req->post.close_channel.channel = channel; + req->post.close_channel.cbfunc = cbfunc; + req->post.close_channel.cbdata = cbdata; + /* setup the event that runs the close channel handler */ + opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, orte_rml_base_close_channel, req); + opal_event_set_priority(&req->ev, ORTE_MSG_PRI); + opal_event_active(&req->ev, OPAL_EV_WRITE, 1); + return ORTE_SUCCESS; +} diff --git a/orte/mca/rml/rml.h b/orte/mca/rml/rml.h index 012f5fef00c..2e11a288992 100644 --- a/orte/mca/rml/rml.h +++ b/orte/mca/rml/rml.h @@ -11,7 +11,9 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights - * reserved. + * reserved. + * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. 
+ * * $COPYRIGHT$ * * Additional copyrights may follow @@ -19,7 +21,7 @@ * $HEADER$ */ -/** +/** * @file * * Runtime Messaging Layer (RML) Communication Interface @@ -74,11 +76,20 @@ ORTE_DECLSPEC void orte_rml_recv_callback(int status, orte_process_name_t* sende opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata); +ORTE_DECLSPEC void orte_rml_open_channel_recv_callback(int status, + orte_process_name_t* sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, void *cbdata); +ORTE_DECLSPEC void orte_rml_close_channel_recv_callback(int status, + orte_process_name_t* sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, void *cbdata); + /* ******************************************************************** */ /** - * RML component initialization + * RML component initialization * * Create an instance (module) of the given RML component. Upon * returning, the module data structure should be fully populated and @@ -176,7 +187,6 @@ typedef void (*orte_rml_buffer_callback_fn_t)(int status, orte_rml_tag_t tag, void* cbdata); - /** * Function prototype for exception callback * @@ -284,7 +294,7 @@ typedef void (*orte_rml_module_set_contact_info_fn_t)(const char *contact_info); * @param[in] contact_info The contact info string for the remote process * @param[in] tv Timeout after which the ping should be failed * - * @retval ORTE_SUCESS The process is available and will allow connections + * @retval ORTE_SUCESS The process is available and will allow connections * from the local process * @retval ORTE_ERROR An unspecified error occurred during the update */ @@ -388,7 +398,7 @@ typedef void (*orte_rml_module_recv_buffer_nb_fn_t)(orte_process_name_t* peer, * * Attempt to cancel a posted non-blocking receive. 
* - * @param[in] peer Peer process or ORTE_NAME_WILDCARD, exactly as passed + * @param[in] peer Peer process or ORTE_NAME_WILDCARD, exactly as passed * to the non-blocking receive call * @param[in] tag Posted receive tag */ @@ -429,6 +439,155 @@ typedef int (*orte_rml_module_ft_event_fn_t)(int state); */ typedef void (*orte_rml_module_purge_fn_t)(orte_process_name_t *peer); +/********* NEW RML QOS MESSAGING APIS *****************/ +/***** Questions *****/ +/* +1 : Should we provide a func for the user to get qos attributes of a channel? (do we allow for sets??) +2: Should open channel - have a channel error callback function? +*/ +typedef void (*orte_rml_channel_callback_fn_t) (int status, + orte_rml_channel_num_t channel_num, + orte_process_name_t * peer, + opal_list_t *qos_attributes, + void * cbdata); +/** + * Function prototype for callback from non-blocking iovec send on a channel + * + * Function prototype for callback from non-blocking iovec send on a channel + * On send, the iovec pointer will be the same pointer passed to + * send_nb and count will equal the count given to send. + * + * + * @note The parameter in/out parameters are relative to the user's callback + * function. + * + * @param[in] status Completion status + * @param[in] channel Opaque channel number on which the msg was sent (input to rml_send_channel) + * @param[in] msg Pointer to the array of iovec that was sent + * or to a single iovec that has been recvd + * @param[in] count Number of iovecs in the array + * @param[in] tag User defined tag for matching send/recv + * @param[in] cbdata User data passed to send_nb() + */ +typedef void (*orte_rml_send_channel_callback_fn_t)(int status, + orte_rml_channel_num_t channel, + struct iovec* msg, + int count, + orte_rml_tag_t tag, + void* cbdata); +/** + * Function prototype for callback from non-blocking buffer send on a channel + * + * Function prototype for callback from non-blocking buffer send on a + * channel.
On send, the buffer will be the same pointer passed to + * send_buffer_nb. + * + * @note The parameter in/out parameters are relative to the user's callback + * function. + * + * @param[in] status Completion status + * @param[in] channel channel number on which the msg was sent + * @param[in] buffer Message buffer + * @param[in] tag User defined tag for matching send + * @param[in] cbdata User data passed to send_buffer_nb() + */ +typedef void (*orte_rml_send_buffer_channel_callback_fn_t)(int status, + orte_rml_channel_num_t channel, + struct opal_buffer_t* buffer, + orte_rml_tag_t tag, + void* cbdata); + +/** + * * Open a messaging channel with specified QoS to a specific peer + * + * @param[in] peer End point Peer to which the channel needs to be opened + * @param[in] qos_attributes List of Quality of Service Attributes for the channel + * @param[in] cbfunc Callback function on channel create (open) completion + * @param[in] cbdata User data to provide during completion callback + * + * @retval ORTE_SUCCESS - the channel was successfully created at the source and a request was sent to the dest. + * @retval ORTE_ERROR - unknown error + * @retval ORTE_ERROR_UNSUPPORTED_QOS - the requested QoS cannot be provided. + */ +typedef int (*orte_rml_module_open_channel_fn_t)(orte_process_name_t* peer, + opal_list_t *qos_attributes, + orte_rml_channel_callback_fn_t cbfunc, + void* cbdata); + +/** + * Send an iovec non-blocking message + * + * Send an array of iovecs to the specified peer. The call + * will return immediately, although the iovecs may not be modified + * until the completion callback is triggered. The iovecs *may* be + * passed to another call to send_nb before the completion callback is + * triggered. The callback being triggered does not give any + * indication of remote completion. + * + * @param[in] channel Channel number of the specific channel (given to user in the channel open completion callback fn.)
+ * @param[in] msg Pointer to an array of iovecs to be sent + * @param[in] count Number of iovecs in array + * @param[in] tag User defined tag for matching send/recv + * @param[in] cbfunc Callback function on message completion + * @param[in] cbdata User data to provide during completion callback + * + * @retval ORTE_SUCCESS The message was successfully started + * @retval ORTE_ERR_BAD_PARAM One of the parameters was invalid + * @retval ORTE_ERR_CHANNEL_UNKNOWN Channel specified does not exist. + * @retval ORTE_ERROR An unspecified error occurred + */ +typedef int (*orte_rml_module_send_channel_nb_fn_t)(orte_rml_channel_num_t channel, + struct iovec* msg, + int count, + orte_rml_tag_t tag, + orte_rml_send_channel_callback_fn_t cbfunc, + void* cbdata); + + +/** + * Send a buffer non-blocking message + * + * Send a buffer on a specific pre-established channel. The call + * will return immediately, although the buffer may not be modified + * until the completion callback is triggered. The buffer *may* be + * passed to another call to send_nb before the completion callback is + * triggered. The callback being triggered does not give any + * indication of remote completion. + * + * @param[in] channel Channel number of the specific channel (given to user in the channel open completion callback fn.) + * @param[in] buffer Pointer to buffer to be sent + * @param[in] tag User defined tag for matching send/recv + * @param[in] cbfunc Callback function on message completion + * @param[in] cbdata User data to provide during completion callback + * + * @retval ORTE_SUCCESS The message was successfully started + * @retval ORTE_ERR_BAD_PARAM One of the parameters was invalid + * @retval ORTE_ERR_CHANNEL_UNKNOWN Channel specified does not exist.
+ * @retval ORTE_ERROR An unspecified error occurred + */ + +typedef int (*orte_rml_module_send_buffer_channel_nb_fn_t) (orte_rml_channel_num_t channel, + struct opal_buffer_t * buffer, + orte_rml_tag_t tag, + orte_rml_send_buffer_channel_callback_fn_t cbfunc, + void* cbdata); + +/** + * * close a messaging channel with specified QoS to a specific peer + * + * @param[in] peer End point Peer to which the channel needs to be opened + * @param[in] channel_num The channel number returned in the channel open completion callback function. + * @param[in] cbfunc Callback function on channel close completion + * @param[in] cbdata User data to provide during completion callback + * + * @retval ORTE_SUCCESS - the channel was successfully closed at the source and a request was sent to the dest. + * @retval ORTE_ERROR - unknown error + * @retval ORTE_ERROR_UNKNOWN_CHANNEL - cannot find the specified QoS channel + */ +typedef int (*orte_rml_module_close_channel_fn_t)( orte_rml_channel_num_t channel_num, + orte_rml_channel_callback_fn_t cbfunc, + void* cbdata); + /* ******************************************************************** */ @@ -474,9 +633,21 @@ struct orte_rml_module_t { /** Fault tolerance handler */ orte_rml_module_ft_event_fn_t ft_event; - + /** Purge information */ orte_rml_module_purge_fn_t purge; + + /** Open a qos messaging channel to a peer*/ + orte_rml_module_open_channel_fn_t open_channel; + + /** send a non blocking iovec message over a channel */ + orte_rml_module_send_channel_nb_fn_t send_channel_nb; + + /** send a non blocking buffer message over a channel */ + orte_rml_module_send_buffer_channel_nb_fn_t send_buffer_channel_nb; + + /** close a qos messaging channel */ + orte_rml_module_close_channel_fn_t close_channel; }; /** Convienence typedef */ typedef struct orte_rml_module_t orte_rml_module_t; diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h index 8f084982e82..cba9f07fc67 100644 --- a/orte/mca/rml/rml_types.h +++
b/orte/mca/rml/rml_types.h @@ -5,17 +5,18 @@ * Copyright (c) 2004-2011 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights - * reserved. + * reserved. * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ /** @file: @@ -150,17 +151,21 @@ BEGIN_C_DECLS /* notifier support */ #define ORTE_RML_TAG_NOTIFIER_HNP 52 - -/* confirm spawn by tool */ #define ORTE_RML_TAG_CONFIRM_SPAWN 53 +/*** QOS specific RML TAGS ***/ +#define ORTE_RML_TAG_OPEN_CHANNEL_REQ 54 +#define ORTE_RML_TAG_OPEN_CHANNEL_RESP 55 +#define ORTE_RML_TAG_MSG_ACK 56 +#define ORTE_RML_TAG_CLOSE_CHANNEL_REQ 57 +#define ORTE_RML_TAG_CLOSE_CHANNEL_ACCEPT 58 #define ORTE_RML_TAG_MAX 100 #define ORTE_RML_TAG_NTOH(t) ntohl(t) #define ORTE_RML_TAG_HTON(t) htonl(t) -/** +/** * Message matching tag * * Message matching tag. 
Unlike MPI, there is no wildcard receive, @@ -170,6 +175,11 @@ BEGIN_C_DECLS */ typedef uint32_t orte_rml_tag_t; +/** + * Channel number + * Reference to a rml channel + */ +typedef uint32_t orte_rml_channel_num_t; /* ******************************************************************** */ diff --git a/orte/test/system/oob_stress_channel.c b/orte/test/system/oob_stress_channel.c new file mode 100644 index 00000000000..0cee39c9ed0 --- /dev/null +++ b/orte/test/system/oob_stress_channel.c @@ -0,0 +1,232 @@ +#include "orte_config.h" + +#include +#include +#include + +#include "opal/runtime/opal_progress.h" + +#include "orte/util/proc_info.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "orte/runtime/runtime.h" +#include "orte/runtime/orte_wait.h" +#include "orte/mca/qos/qos.h" +#include "orte/util/attr.h" + +#define MY_TAG 12345 +#define MAX_COUNT 3 + +static volatile bool msgs_recvd; +static volatile bool channel_inactive = false; +static volatile bool channel_active = false; +static volatile bool msg_active = false; +static volatile orte_rml_channel_num_t channel; +static volatile int num_msgs_recvd = 0; +static volatile int num_msgs_sent = 0; + +static void close_channel_callback(int status, + orte_rml_channel_num_t channel_num, + orte_process_name_t * peer, + opal_list_t *qos_attributes, + void * cbdata) +{ + if (ORTE_SUCCESS != status) + opal_output(0, "close channel not successful status =%d", status); + else + opal_output(0, "close channel successful - channel num = %d", channel_num); + channel_active = false; +} + +static void open_channel_callback(int status, + orte_rml_channel_num_t channel_num, + orte_process_name_t * peer, + opal_list_t *qos_attributes, + void * cbdata) +{ + if (ORTE_SUCCESS != status) { + opal_output(0, "open channel not successful status =%d", status); + + } else { + channel = channel_num; + opal_output(0, "Open channel successful - 
channel num = %d", channel_num); + + } + channel_inactive = false; +} + +static void send_callback(int status, orte_process_name_t *peer, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) + +{ + OBJ_RELEASE(buffer); + num_msgs_sent++; + if (ORTE_SUCCESS != status) { + opal_output(0, "rml_send_nb not successful status =%d", status); + } + if(num_msgs_sent == 5) + msg_active = false; +} + +static void recv_callback(int status, orte_process_name_t *sender, + opal_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata) + +{ + //orte_rml_recv_cb_t *blob = (orte_rml_recv_cb_t*)cbdata; + num_msgs_recvd++; + opal_output(0, "recv_callback received msg =%d", num_msgs_recvd); + if ( num_msgs_recvd == 5) { + num_msgs_recvd =0; + msgs_recvd = false; + + } + +} + +static void channel_send_callback (int status, orte_rml_channel_num_t channel, + opal_buffer_t * buffer, orte_rml_tag_t tag, + void *cbdata) +{ + OBJ_RELEASE(buffer); + if (ORTE_SUCCESS != status) { + opal_output(0, "send_nb_channel not successful status =%d", status); + } + msg_active = false; +} + + +int main(int argc, char *argv[]){ + int count; + int msgsize; + int *type, type_val; + int *i, j, rc, n; + orte_process_name_t peer; + double maxpower; + opal_buffer_t *buf; + orte_rml_recv_cb_t blob; + opal_list_t *qos_attributes; + int window; + uint32_t timeout = 1; + bool retry = false; + uint8_t *msg; + /* + * Init + */ + orte_init(&argc, &argv, ORTE_PROC_NON_MPI); + + if (argc > 1) { + count = atoi(argv[1]); + if (count < 0) { + count = INT_MAX-1; + } + } else { + count = MAX_COUNT; + } + + peer.jobid = ORTE_PROC_MY_NAME->jobid; + peer.vpid = ORTE_PROC_MY_NAME->vpid + 1; + if (peer.vpid == orte_process_info.num_procs) { + peer.vpid = 0; + } + type_val = orte_qos_ack; + type = &type_val; + window = 5; + count =3; + qos_attributes = OBJ_NEW (opal_list_t); + if (ORTE_SUCCESS == (rc = orte_set_attribute( qos_attributes, + ORTE_QOS_TYPE, ORTE_ATTR_GLOBAL, (void*)type, OPAL_UINT8))) { + type = &window; + if 
(ORTE_SUCCESS == (rc = orte_set_attribute(qos_attributes, ORTE_QOS_WINDOW_SIZE, + ORTE_ATTR_GLOBAL, (void*) type, OPAL_UINT32))) { + // orte_get_attribute( &qos_attributes, ORTE_QOS_WINDOW_SIZE, (void**)&type, OPAL_UINT32); + // opal_output(0, "%s set attribute window =%d complete \n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), *type ); + type = &timeout; + orte_set_attribute (qos_attributes, ORTE_QOS_ACK_NACK_TIMEOUT, ORTE_ATTR_GLOBAL, + (void*)type, OPAL_UINT32); + orte_set_attribute (qos_attributes, ORTE_QOS_MSG_RETRY, ORTE_ATTR_GLOBAL, + NULL, OPAL_BOOL); + /* Uncomment following lines to print channel attributes */ + /* + opal_output(0, "%s set attribute retry =%d complete \n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), retry ); + orte_get_attribute( qos_attributes, ORTE_QOS_TYPE, (void**)&type, OPAL_UINT8); + opal_output(0, "%s set attribute type =%d complete \n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), *type ); + orte_get_attribute( qos_attributes, ORTE_QOS_WINDOW_SIZE, (void**)&type, OPAL_UINT32); + opal_output(0, "%s set attribute window =%d complete \n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), *type ) + orte_get_attribute( qos_attributes, ORTE_QOS_ACK_NACK_TIMEOUT, (void**)&type, OPAL_UINT32); + opal_output(0, "%s set attribute timeout =%d complete \n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), *type );*/ + channel_inactive = true; + orte_rml.open_channel ( &peer, qos_attributes, open_channel_callback, NULL); + opal_output(0, "%s process sent open channel request %d waiting for completion \n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j); + ORTE_WAIT_FOR_COMPLETION(channel_inactive); + opal_output(0, "%s open channel complete to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&peer)); + } + } + for (j = 0; j< count; j++) + { + if (ORTE_PROC_MY_NAME->vpid == 0) + { + /* rank0 starts ring */ + msg_active = true; + for (n = 0; n< window; n++ ) + { + buf = OBJ_NEW(opal_buffer_t); + maxpower = (double)(j%7); + msgsize = (int)pow(10.0, maxpower); + opal_output(0, "Ring %d message 
%d size %d bytes", j,n, msgsize); + msg = (uint8_t*)malloc(msgsize); + opal_dss.pack(buf, msg, msgsize, OPAL_BYTE); + free(msg); + orte_rml.send_buffer_channel_nb(channel, buf, MY_TAG, channel_send_callback, NULL); + OBJ_CONSTRUCT(&blob, orte_rml_recv_cb_t); + blob.active = true; + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG, + ORTE_RML_NON_PERSISTENT, + orte_rml_recv_callback, &blob); + ORTE_WAIT_FOR_COMPLETION(blob.active); + OBJ_DESTRUCT(&blob); + //orte_rml.send_buffer_nb(&peer, buf,MY_TAG, send_callback, NULL) + } + ORTE_WAIT_FOR_COMPLETION(msg_active); + opal_output(0, "%s Ring %d completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j); + //sleep(2); + } + else + { + msg_active = true; + for (n =0; n < window; n++) { + OBJ_CONSTRUCT(&blob, orte_rml_recv_cb_t); + blob.active = true; + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG, + ORTE_RML_NON_PERSISTENT, + orte_rml_recv_callback, &blob); + ORTE_WAIT_FOR_COMPLETION(blob.active); + opal_output(0, "%s received message %d from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j, + ORTE_NAME_PRINT(&blob.name)); + /* send it along */ + buf = OBJ_NEW(opal_buffer_t); + opal_dss.copy_payload(buf, &blob.data); + OBJ_DESTRUCT(&blob); + orte_rml.send_buffer_channel_nb(channel, buf, MY_TAG, channel_send_callback, NULL); + } + ORTE_WAIT_FOR_COMPLETION(msg_active); + opal_output(0, "%s Ring %d completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j); + //sleep (2); + } + } + channel_active = true; + orte_rml.close_channel ( channel,close_channel_callback, NULL); + opal_output(0, "%s process sent close channel request waiting for completion \n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + ORTE_WAIT_FOR_COMPLETION(channel_active); + opal_output(0, "%s close channel complete to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&peer)); + orte_finalize(); + return 0; +} diff --git a/orte/util/attr.h b/orte/util/attr.h index c90c036be50..f8d6fc6aac3 100644 --- a/orte/util/attr.h +++ b/orte/util/attr.h @@ -1,9 +1,9 @@ /* - * 
Copyright (c) 2014 Intel, Inc. All rights reserved + * Copyright (c) 2014-2015 Intel, Inc. All rights reserved * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -80,7 +80,7 @@ typedef uint16_t orte_job_flags_t; #define ORTE_JOB_FLAG_DEBUGGER_DAEMON 0x0010 // job is launching debugger daemons #define ORTE_JOB_FLAG_FORWARD_OUTPUT 0x0020 // forward output from the apps #define ORTE_JOB_FLAG_DO_NOT_MONITOR 0x0040 // do not monitor apps for termination -#define ORTE_JOB_FLAG_FORWARD_COMM 0x0080 // +#define ORTE_JOB_FLAG_FORWARD_COMM 0x0080 // #define ORTE_JOB_FLAG_RECOVERABLE 0x0100 // job is recoverable #define ORTE_JOB_FLAG_RESTART 0x0200 // #define ORTE_JOB_FLAG_PROCS_MIGRATING 0x0400 // some procs in job are migrating from one node to another @@ -168,6 +168,16 @@ typedef uint16_t orte_proc_flags_t; #define ORTE_PROC_MAX_KEY 400 +/*** MESSAGING QOS ATTRIBUTE KEYS ***/ +#define ORTE_QOS_START_KEY ORTE_PROC_MAX_KEY +#define ORTE_QOS_TYPE (ORTE_QOS_START_KEY + 1) //uint8- defining what type of qos - refer to orte_qos_type enum for values +#define ORTE_QOS_WINDOW_SIZE (ORTE_QOS_START_KEY + 2) // uint32 - number of messages in the window (stream) +#define ORTE_QOS_ACK_NACK_TIMEOUT (ORTE_QOS_START_KEY + 3) //uint32 - timeout value in secs for msg/window ack nack +#define ORTE_QOS_MSG_RETRY (ORTE_QOS_START_KEY + 4) // bool- resend message upon ACK fail or NACK or timeout. +#define ORTE_QOS_NUM_RETRIES (ORTE_QOS_START_KEY + 5) // uint32 - number of retries. + +#define ORTE_QOS_MAX_KEY 500 + #define ORTE_ATTR_KEY_MAX 1000