From 6fd5b7a4745665316528cc7d050ed75f0b18c944 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Sat, 4 Mar 2017 17:35:54 -0600 Subject: [PATCH] sharedfp/lockedfile and sm: fix name collision This fixes the issue reported by Nicolas Joly on the mailing list: the sharedfp/lockedfile component does not currently support a scenario where multiple jobs read from the same input file, due to a collision of the filenames utilized for the sharedfp handle. Although not part of the original report, the same occurs for the sharedfp/sm component. Therefore, add the jobid to the lockedfilename/sm file name. Use the OMPI_CAST_RTE_NAME macro to determine the jobid. Fixes: #3098 Signed-off-by: Edgar Gabriel --- .../sharedfp_lockedfile_file_open.c | 23 ++++++++++++++++--- ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c | 14 +++++++++-- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile_file_open.c b/ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile_file_open.c index 5c08b6a94e0..c40348d848b 100644 --- a/ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile_file_open.c +++ b/ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile_file_open.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2013-2015 University of Houston. All rights reserved. + * Copyright (c) 2013-2017 University of Houston. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ @@ -25,6 +25,8 @@ #include "mpi.h" #include "ompi/constants.h" +#include "ompi/group/group.h" +#include "ompi/proc/proc.h" #include "ompi/mca/sharedfp/sharedfp.h" #include "ompi/mca/sharedfp/base/base.h" @@ -99,8 +101,23 @@ int mca_sharedfp_lockedfile_file_open (struct ompi_communicator_t *comm, return OMPI_ERR_OUT_OF_RESOURCE; } - lockedfilename = (char*)malloc(sizeof(char) * (strlen(filename) + 64)); - sprintf(lockedfilename,"%s%s",filename,".lockedfile"); + opal_jobid_t masterjobid; + if ( 0 == comm->c_my_rank ) { + ompi_proc_t *masterproc = ompi_group_peer_lookup(comm->c_local_group, 0 ); + masterjobid = OMPI_CAST_RTE_NAME(&masterproc->super.proc_name)->jobid; + } + comm->c_coll.coll_bcast ( &masterjobid, 1, MPI_UNSIGNED, 0, comm, + comm->c_coll.coll_bcast_module ); + + size_t filenamelen = strlen(filename) + 16; + lockedfilename = (char*)malloc(sizeof(char) * filenamelen); + if ( NULL == lockedfilename ) { + free (shfileHandle); + free (sh); + free (module_data); + return OMPI_ERR_OUT_OF_RESOURCE; + } + snprintf(lockedfilename, filenamelen, "%s-%u%s",filename,masterjobid,".lock"); module_data->filename = lockedfilename; /*-------------------------------------------------*/ diff --git a/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c b/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c index da4ff93dbcc..0c713b393d5 100644 --- a/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c +++ b/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2013-2015 University of Houston. All rights reserved. + * Copyright (c) 2013-2017 University of Houston. All rights reserved. * Copyright (c) 2013 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
@@ -35,6 +35,8 @@ #include "mpi.h" #include "ompi/constants.h" +#include "ompi/group/group.h" +#include "ompi/proc/proc.h" #include "ompi/mca/sharedfp/sharedfp.h" #include "ompi/mca/sharedfp/base/base.h" @@ -139,8 +141,16 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm, free(shfileHandle); return OMPI_ERR_OUT_OF_RESOURCE; } - sprintf(sm_filename,"/tmp/OMPIO_sharedfp_sm_%s%s",filename_basename,".sm"); + opal_jobid_t masterjobid; + if ( 0 == comm->c_my_rank ) { + ompi_proc_t *masterproc = ompi_group_peer_lookup(comm->c_local_group, 0 ); + masterjobid = OMPI_CAST_RTE_NAME(&masterproc->super.proc_name)->jobid; + } + comm->c_coll.coll_bcast ( &masterjobid, 1, MPI_UNSIGNED, 0, comm, + comm->c_coll.coll_bcast_module ); + + sprintf(sm_filename,"/tmp/OMPIO_%s_%d_%s",filename_basename, masterjobid, ".sm"); /* open shared memory file, initialize to 0, map into memory */ sm_fd = open(sm_filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);