@@ -114,7 +114,16 @@ static int orte_create_dir(char *directory)
114114
115115/*
116116 * Construct the fullpath to the session directory - it
117- * will consist of "ompi.<hostname>.<pid>"
117+ * will consist of "ompi.<hostname>.<effective-uid>", and
118+ * have subdirs:
119+ *
120+ * pid - the pid of the mpirun that oversees this job. Note
121+ * that direct-launched processes are not started by mpirun,
122+ * so they use a manufactured (agreed-upon) value instead
123+ *
124+ * jobid - jobid of the application being executed
125+ *
126+ * vpid - vpid of the process
118127 */
119128int
120129orte_session_dir_get_name (char * * fulldirpath ,
@@ -132,10 +141,14 @@ orte_session_dir_get_name(char **fulldirpath,
132141 bool prefix_provided = false;
133142 int exit_status = ORTE_SUCCESS ;
134143 size_t len ;
144+ uid_t uid ;
135145
136146 /* Ensure that system info is set */
137147 orte_proc_info ();
138148
149+ /* get the effective uid */
150+ uid = geteuid ();
151+
139152 /*
140153 * set the 'hostname'
141154 */
@@ -156,30 +169,48 @@ orte_session_dir_get_name(char **fulldirpath,
156169 /* construct the frontend of the session directory*/
157170 if (NULL != orte_process_info .top_session_dir ) {
158171 frontend = strdup (orte_process_info .top_session_dir );
172+ } else { /* If not set then construct it */
173+ if (0 > asprintf (& frontend , "ompi.%s.%lu" , hostname , (unsigned long )uid )) {
174+ ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
175+ exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
176+ goto cleanup ;
177+ }
159178 }
160- else { /* If not set then construct it */
161- if (0 > asprintf (& frontend , "ompi.%s.%lu" , hostname , (unsigned long )orte_process_info .pid )) {
179+
180+ /* construct the next level down, which belongs to the
181+ * job family. This is related to the mpirun that launched
182+ * the job, or is an arbitrary (agreed upon) value if
183+ * direct launched */
184+ if (ORTE_PROC_IS_HNP ) {
185+ if (0 > asprintf (& jobfam , "pid.%lu" , (unsigned long )orte_process_info .pid )) {
162186 ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
163187 exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
164188 goto cleanup ;
165189 }
190+ orte_process_info .jobfam_session_dir = strdup (jobfam );
191+ } else if (NULL != orte_process_info .jobfam_session_dir ) {
192+ /* we had a job family session dir passed down to us by mpirun */
193+ jobfam = strdup (orte_process_info .jobfam_session_dir );
194+ } else {
195+ /* we were not given one, so define it */
196+ if (NULL == proc ) {
197+ jobfam = strdup ("jobfam" );
198+ } else {
199+ if (0 > asprintf (& jobfam , "jf.%d" , ORTE_JOB_FAMILY (proc -> jobid ))) {
200+ ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
201+ exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
202+ goto cleanup ;
203+ }
204+ }
205+ orte_process_info .jobfam_session_dir = strdup (jobfam );
166206 }
167207
168208 /*
169209 * Construct the session directory
170210 */
171- /* If we were given a valid vpid then we can construct it fully into:
172- * openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID
173- */
211+ /* If we were given a valid vpid then we can construct it fully */
174212 if ( NULL != proc ) {
175213 if (ORTE_VPID_INVALID != proc -> vpid ) {
176-
177- if (0 > asprintf (& jobfam , "%d" , ORTE_JOB_FAMILY (proc -> jobid ))) {
178- ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
179- exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
180- goto cleanup ;
181- }
182-
183214 if (0 > asprintf (& job , "%d" , ORTE_LOCAL_JOBID (proc -> jobid ))) {
184215 ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
185216 exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
@@ -192,23 +223,13 @@ orte_session_dir_get_name(char **fulldirpath,
192223 goto cleanup ;
193224 }
194225
195- sessions = opal_os_path ( false, frontend , jobfam , job , vpidstr , NULL );
226+ sessions = opal_os_path (false, frontend , jobfam , job , vpidstr , NULL );
196227 if ( NULL == sessions ) {
197228 ORTE_ERROR_LOG (ORTE_ERROR );
198229 exit_status = ORTE_ERROR ;
199230 goto cleanup ;
200231 }
201- }
202- /* If we were given a valid jobid then we can construct it partially into:
203- * openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID
204- */
205- else if (ORTE_JOBID_INVALID != proc -> jobid ) {
206- if (0 > asprintf (& jobfam , "%d" , ORTE_JOB_FAMILY (proc -> jobid ))) {
207- ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
208- exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
209- goto cleanup ;
210- }
211-
232+ } else if (ORTE_JOBID_INVALID != proc -> jobid ) {
212233 if (0 > asprintf (& job , "%d" , ORTE_LOCAL_JOBID (proc -> jobid ))) {
213234 ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
214235 exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
@@ -221,14 +242,12 @@ orte_session_dir_get_name(char **fulldirpath,
221242 exit_status = ORTE_ERROR ;
222243 goto cleanup ;
223244 }
224- } /* if both are invalid */
225- else {
245+ } else {
226246 sessions = strdup (frontend ); /* must dup this to avoid double-free later */
227247 }
228248
229- } /* If we were not given a proc at all, then we just set it to frontend
230- */
231- else {
249+ } else {
250+ /* If we were not given a proc at all, then we just set it to frontend */
232251 sessions = strdup (frontend ); /* must dup this to avoid double-free later */
233252 }
234253
@@ -666,14 +685,8 @@ static char *orte_build_job_session_dir(char *top_dir,
666685 orte_process_name_t * proc ,
667686 orte_jobid_t jobid )
668687{
669- char * jobfam = NULL ;
670688 char * job_session_dir ;
671689
672- if (0 > asprintf (& jobfam , "%d" , ORTE_JOB_FAMILY (proc -> jobid ))) {
673- ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
674- return NULL ;
675- }
676-
677690 if (ORTE_JOBID_WILDCARD != jobid ) {
678691 char * job = NULL ;
679692
@@ -682,19 +695,18 @@ static char *orte_build_job_session_dir(char *top_dir,
682695 job_session_dir = NULL ;
683696 goto out ;
684697 }
685- job_session_dir = opal_os_path (false, top_dir , jobfam , job , NULL );
698+ job_session_dir = opal_os_path (false, top_dir , orte_process_info . jobfam_session_dir , job , NULL );
686699 free (job );
687700 if (NULL == job_session_dir ) {
688701 ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
689702 }
690703 } else {
691- job_session_dir = opal_os_path (false, top_dir , jobfam , NULL );
704+ job_session_dir = opal_os_path (false, top_dir , orte_process_info . jobfam_session_dir , NULL );
692705 if ( NULL == job_session_dir ) {
693706 ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
694707 }
695708 }
696709
697710out :
698- free (jobfam );
699711 return job_session_dir ;
700712}
0 commit comments