Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions orte/mca/oob/usock/oob_usock_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ static void connection_event_handler(int incoming_sd, short flags, void* cbdata)
static int component_startup(void)
{
int rc=ORTE_SUCCESS;
char *session;

opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s USOCK STARTUP",
Expand All @@ -213,11 +214,18 @@ static int component_startup(void)
/* setup the path to the daemon rendezvous point */
memset(&mca_oob_usock_component.address, 0, sizeof(struct sockaddr_un));
mca_oob_usock_component.address.sun_family = AF_UNIX;
session = opal_os_path(false, orte_process_info.tmpdir_base,
orte_process_info.top_session_dir,
orte_process_info.jobfam_session_dir,
"usock", NULL);
if ((strlen(session) + 1) > sizeof(mca_oob_usock_component.address.sun_path)-1) {
opal_output(0, "SESSION DIR TOO LONG");
return ORTE_ERR_NOT_SUPPORTED;
}
snprintf(mca_oob_usock_component.address.sun_path,
sizeof(mca_oob_usock_component.address.sun_path)-1,
"%s/%s/%s/0/%s", orte_process_info.tmpdir_base,
orte_process_info.top_session_dir,
ORTE_JOB_FAMILY_PRINT(ORTE_PROC_MY_NAME->jobid), "usock");
"%s", session);
free(session);
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"SUNPATH: %s", mca_oob_usock_component.address.sun_path);

Expand All @@ -231,7 +239,7 @@ static int component_startup(void)
/* if the rendezvous file doesn't exist or isn't readable, that's an error */
if (0 != access(mca_oob_usock_component.address.sun_path, R_OK)) {
opal_output_verbose(2, orte_oob_base_framework.framework_output,
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"SUNPATH: %s NOT READABLE", mca_oob_usock_component.address.sun_path);
return OPAL_ERR_NOT_FOUND;
}
Expand Down
1 change: 1 addition & 0 deletions orte/mca/schizo/ompi/schizo_ompi.c
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,7 @@ static int setup_fork(orte_job_t *jdata,
/* forcibly set the local tmpdir base, top session dir, and jobfam session dir to match ours */
opal_setenv("OMPI_MCA_orte_tmpdir_base", orte_process_info.tmpdir_base, true, &app->env);
opal_setenv("OMPI_MCA_orte_top_session_dir", orte_process_info.top_session_dir, true, &app->env);
opal_setenv("OMPI_MCA_orte_jobfam_session_dir", orte_process_info.jobfam_session_dir, true, &app->env);

/* MPI-3 requires we provide some further info to the procs,
* so we pass them as envars to avoid introducing further
Expand Down
8 changes: 6 additions & 2 deletions orte/orted/pmix/pmix_server.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
#include "opal/util/show_help.h"
#include "opal/util/error.h"
#include "opal/util/output.h"
#include "opal/util/os_path.h"
#include "opal/util/argv.h"

#include "orte/mca/errmgr/errmgr.h"
Expand Down Expand Up @@ -261,9 +262,12 @@ int pmix_server_init(void)
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_SERVER_TMPDIR);
kv->type = OPAL_STRING;
kv->data.string = strdup(orte_process_info.tmpdir_base);
kv->data.string = opal_os_path(false, orte_process_info.tmpdir_base,
orte_process_info.top_session_dir,
orte_process_info.jobfam_session_dir, NULL);
opal_list_append(&info, &kv->super);
/* use the same for the system temp directory */
/* use the same for the system temp directory - this is
* where the system-level tool connections will go */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_SYSTEM_TMPDIR);
kv->type = OPAL_STRING;
Expand Down
17 changes: 16 additions & 1 deletion orte/runtime/orte_mca_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
Expand Down Expand Up @@ -51,6 +51,7 @@ static char *orte_tmpdir_base = NULL;
static char *orte_local_tmpdir_base = NULL;
static char *orte_remote_tmpdir_base = NULL;
static char *orte_top_session_dir = NULL;
static char *orte_jobfam_session_dir = NULL;

int orte_register_params(void)
{
Expand Down Expand Up @@ -165,6 +166,20 @@ int orte_register_params(void)
orte_process_info.top_session_dir = strdup(orte_top_session_dir);
}

orte_jobfam_session_dir = NULL;
(void) mca_base_var_register ("orte", "orte", NULL, "jobfam_session_dir",
"The jobfamily session directory for applications",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL_EQ,
&orte_jobfam_session_dir);

if (NULL != orte_jobfam_session_dir) {
if (NULL != orte_process_info.jobfam_session_dir) {
free(orte_process_info.jobfam_session_dir);
}
orte_process_info.jobfam_session_dir = strdup(orte_jobfam_session_dir);
}

orte_prohibited_session_dirs = NULL;
(void) mca_base_var_register ("orte", "orte", NULL, "no_session_dirs",
"Prohibited locations for session directories (multiple locations separated by ',', default=NULL)",
Expand Down
8 changes: 7 additions & 1 deletion orte/util/proc_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -80,6 +80,7 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
.num_local_peers = 0,
.tmpdir_base = NULL,
.top_session_dir = NULL,
.jobfam_session_dir = NULL,
.job_session_dir = NULL,
.proc_session_dir = NULL,
.sock_stdin = NULL,
Expand Down Expand Up @@ -294,6 +295,11 @@ int orte_proc_info_finalize(void)
orte_process_info.top_session_dir = NULL;
}

if (NULL != orte_process_info.jobfam_session_dir) {
free(orte_process_info.jobfam_session_dir);
orte_process_info.jobfam_session_dir = NULL;
}

if (NULL != orte_process_info.job_session_dir) {
free(orte_process_info.job_session_dir);
orte_process_info.job_session_dir = NULL;
Expand Down
3 changes: 2 additions & 1 deletion orte/util/proc_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -119,6 +119,7 @@ struct orte_proc_info_t {
*/
char *tmpdir_base; /**< Base directory of the session dir tree */
char *top_session_dir; /**< Top-most directory of the session tree */
char *jobfam_session_dir; /**< Session directory for this family of jobs (i.e., share same mpirun) */
char *job_session_dir; /**< Session directory for job */
char *proc_session_dir; /**< Session directory for the process */

Expand Down
90 changes: 51 additions & 39 deletions orte/util/session_dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,16 @@ static int orte_create_dir(char *directory)

/*
* Construct the fullpath to the session directory - it
* will consist of "ompi.<hostname>.<pid>"
* will consist of "ompi.<hostname>.<effective-uid>", and
* have subdirs:
*
* pid - "pid.<pid>", using the pid of the mpirun that oversees
* this job. Note that processes that were not handed a value
* (e.g., direct-launched ones) manufacture this level themselves
*
* jobid - jobid of the application being executed
*
* vpid - vpid of the process
*/
int
orte_session_dir_get_name(char **fulldirpath,
Expand All @@ -132,10 +141,14 @@ orte_session_dir_get_name(char **fulldirpath,
bool prefix_provided = false;
int exit_status = ORTE_SUCCESS;
size_t len;
uid_t uid;

/* Ensure that system info is set */
orte_proc_info();

/* get the effective uid */
uid = geteuid();

/*
* set the 'hostname'
*/
Expand All @@ -156,30 +169,48 @@ orte_session_dir_get_name(char **fulldirpath,
/* construct the frontend of the session directory */
if (NULL != orte_process_info.top_session_dir) {
frontend = strdup(orte_process_info.top_session_dir);
} else { /* If not set then construct it */
if (0 > asprintf(&frontend, "ompi.%s.%lu", hostname, (unsigned long)uid)) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
}
else { /* If not set then construct it */
if (0 > asprintf(&frontend, "ompi.%s.%lu", hostname, (unsigned long)orte_process_info.pid)) {

/* construct the next level down, which belongs to the
* job family. This is related to the mpirun that launched
* the job, or is an arbitrary (agreed upon) value if
* direct launched */
if (ORTE_PROC_IS_HNP) {
if (0 > asprintf(&jobfam, "pid.%lu", (unsigned long)orte_process_info.pid)) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
orte_process_info.jobfam_session_dir = strdup(jobfam);
} else if (NULL != orte_process_info.jobfam_session_dir) {
/* we had a job family session dir passed down to us by mpirun */
jobfam = strdup(orte_process_info.jobfam_session_dir);
} else {
/* we were not given one, so define it */
if (NULL == proc) {
jobfam = strdup("jobfam");
} else {
if (0 > asprintf(&jobfam, "jf.%d", ORTE_JOB_FAMILY(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
}
orte_process_info.jobfam_session_dir = strdup(jobfam);
}

/*
* Construct the session directory
*/
/* If we were given a valid vpid then we can construct it fully into:
* openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID
*/
/* If we were given a valid vpid then we can construct it fully */
if( NULL != proc) {
if (ORTE_VPID_INVALID != proc->vpid) {

if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}

if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
Expand All @@ -192,23 +223,13 @@ orte_session_dir_get_name(char **fulldirpath,
goto cleanup;
}

sessions = opal_os_path( false, frontend, jobfam, job, vpidstr, NULL );
sessions = opal_os_path(false, frontend, jobfam, job, vpidstr, NULL);
if( NULL == sessions ) {
ORTE_ERROR_LOG(ORTE_ERROR);
exit_status = ORTE_ERROR;
goto cleanup;
}
}
/* If we were given a valid jobid then we can construct it partially into:
* openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID
*/
else if (ORTE_JOBID_INVALID != proc->jobid) {
if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
goto cleanup;
}

} else if (ORTE_JOBID_INVALID != proc->jobid) {
if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
exit_status = ORTE_ERR_OUT_OF_RESOURCE;
Expand All @@ -221,14 +242,12 @@ orte_session_dir_get_name(char **fulldirpath,
exit_status = ORTE_ERROR;
goto cleanup;
}
} /* if both are invalid */
else {
} else {
sessions = strdup(frontend); /* must dup this to avoid double-free later */
}

} /* If we were not given a proc at all, then we just set it to frontend
*/
else {
} else {
/* If we were not given a proc at all, then we just set it to frontend */
sessions = strdup(frontend); /* must dup this to avoid double-free later */
}

Expand Down Expand Up @@ -666,14 +685,8 @@ static char *orte_build_job_session_dir(char *top_dir,
orte_process_name_t *proc,
orte_jobid_t jobid)
{
char *jobfam = NULL;
char *job_session_dir;

if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return NULL;
}

if (ORTE_JOBID_WILDCARD != jobid) {
char *job = NULL;

Expand All @@ -682,19 +695,18 @@ static char *orte_build_job_session_dir(char *top_dir,
job_session_dir = NULL;
goto out;
}
job_session_dir = opal_os_path(false, top_dir, jobfam, job, NULL);
job_session_dir = opal_os_path(false, top_dir, orte_process_info.jobfam_session_dir, job, NULL);
free(job);
if (NULL == job_session_dir) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
}
} else {
job_session_dir = opal_os_path(false, top_dir, jobfam, NULL);
job_session_dir = opal_os_path(false, top_dir, orte_process_info.jobfam_session_dir, NULL);
if( NULL == job_session_dir) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
}
}

out:
free(jobfam);
return job_session_dir;
}