Skip to content

Commit 9b65ec9

Browse files
committed
sharedfp/sm and lockedfile: fix naming bug
If an application opens a file for reading from multiple processes using MPI_COMM_SELF (or another communicator that has distinct process groups but the same comm-id, as can happen as the result of comm_split), the name chosen for the lockedfile, or for the mmapped file used by the sharedfp/sm component, would collide. This patch ensures that the filename is different by integrating the process id of rank 0 of each sub-communicator into it. This fixes one aspect of the problem reported in GitHub issue 5593.

Signed-off-by: Edgar Gabriel <egabriel@central.uh.edu>
1 parent 2221720 commit 9b65ec9

File tree

2 files changed

+38
-7
lines changed

2 files changed

+38
-7
lines changed

ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile_file_open.c

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include <sys/stat.h>
3636
#endif
3737
#include <fcntl.h>
38+
#include <unistd.h>
3839

3940
int mca_sharedfp_lockedfile_file_open (struct ompi_communicator_t *comm,
4041
const char* filename,
@@ -47,7 +48,9 @@ int mca_sharedfp_lockedfile_file_open (struct ompi_communicator_t *comm,
4748
int handle;
4849
struct mca_sharedfp_lockedfile_data * module_data = NULL;
4950
struct mca_sharedfp_base_data_t* sh;
50-
51+
pid_t my_pid;
52+
int int_pid;
53+
5154
/*Memory is allocated here for the sh structure*/
5255
sh = (struct mca_sharedfp_base_data_t*)malloc(sizeof(struct mca_sharedfp_base_data_t));
5356
if ( NULL == sh){
@@ -87,16 +90,28 @@ int mca_sharedfp_lockedfile_file_open (struct ompi_communicator_t *comm,
8790
return err;
8891
}
8992

90-
size_t filenamelen = strlen(filename) + 16;
93+
if ( 0 == fh->f_rank ) {
94+
my_pid = getpid();
95+
int_pid = (int) my_pid;
96+
}
97+
err = comm->c_coll->coll_bcast (&int_pid, 1, MPI_INT, 0, comm, comm->c_coll->coll_bcast_module );
98+
if ( OMPI_SUCCESS != err ) {
99+
opal_output(0, "[%d]mca_sharedfp_lockedfile_file_open: Error in bcast operation\n", fh->f_rank);
100+
free (sh);
101+
free(module_data);
102+
return err;
103+
}
104+
105+
size_t filenamelen = strlen(filename) + 24;
91106
lockedfilename = (char*)malloc(sizeof(char) * filenamelen);
92107
if ( NULL == lockedfilename ) {
93108
free (sh);
94109
free (module_data);
95110
return OMPI_ERR_OUT_OF_RESOURCE;
96111
}
97-
snprintf(lockedfilename, filenamelen, "%s-%u%s",filename,masterjobid,".lock");
112+
snprintf(lockedfilename, filenamelen, "%s-%u-%d%s",filename,masterjobid,int_pid,".lock");
98113
module_data->filename = lockedfilename;
99-
114+
100115
/*-------------------------------------------------*/
101116
/*Open the lockedfile without shared file pointer */
102117
/*-------------------------------------------------*/

ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
#include <semaphore.h>
4545
#include <sys/mman.h>
4646
#include <libgen.h>
47-
47+
#include <unistd.h>
4848

4949
int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm,
5050
const char* filename,
@@ -62,6 +62,8 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm,
6262
struct mca_sharedfp_sm_offset sm_offset;
6363
int sm_fd;
6464
uint32_t comm_cid;
65+
int int_pid;
66+
pid_t my_pid;
6567

6668
/*Memory is allocated here for the sh structure*/
6769
if ( mca_sharedfp_sm_verbose ) {
@@ -100,7 +102,7 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm,
100102
** For sharedfp we also want to put the file backed shared memory into the tmp directory
101103
*/
102104
filename_basename = basename((char*)filename);
103-
/* format is "%s/%s_cid-%d.sm", see below */
105+
/* format is "%s/%s_cid-%d-%d.sm", see below */
104106
sm_filename_length = strlen(ompi_process_info.job_session_dir) + 1 + strlen(filename_basename) + 5 + (3*sizeof(uint32_t)+1) + 4;
105107
sm_filename = (char*) malloc( sizeof(char) * sm_filename_length);
106108
if (NULL == sm_filename) {
@@ -111,7 +113,21 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm,
111113
}
112114

113115
comm_cid = ompi_comm_get_cid(comm);
114-
sprintf(sm_filename, "%s/%s_cid-%d.sm", ompi_process_info.job_session_dir, filename_basename, comm_cid);
116+
if ( 0 == fh->f_rank ) {
117+
my_pid = getpid();
118+
int_pid = (int) my_pid;
119+
}
120+
err = comm->c_coll->coll_bcast (&int_pid, 1, MPI_INT, 0, comm, comm->c_coll->coll_bcast_module );
121+
if ( OMPI_SUCCESS != err ) {
122+
opal_output(0,"mca_sharedfp_sm_file_open: Error in bcast operation \n");
123+
free(sm_filename);
124+
free(sm_data);
125+
free(sh);
126+
return err;
127+
}
128+
129+
snprintf(sm_filename, sm_filename_length, "%s/%s_cid-%d-%d.sm", ompi_process_info.job_session_dir,
130+
filename_basename, comm_cid, int_pid);
115131
/* open shared memory file, initialize to 0, map into memory */
116132
sm_fd = open(sm_filename, O_RDWR | O_CREAT,
117133
S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);

0 commit comments

Comments
 (0)