-
Notifications
You must be signed in to change notification settings - Fork 936
Open
Milestone
Description
Thank you for taking the time to submit an issue!
Background information
What version of Open MPI are you using? (e.g., v3.0.5, v4.0.2, git branch name and hash, etc.)
$ mpiexec --version
mpiexec (Open MPI) 5.0.0
Report bugs to https://www.open-mpi.org/community/help/
Describe how Open MPI was installed (e.g., from a source/distribution tarball, from a git clone, from an operating system distribution package, etc.)
curl -O https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.0.tar.bz2
tar -jxf openmpi-5.0.0.tar.bz2
export PATH=/localdisk/yigoshev/mpi/openmpi-5.0.0-built/bin:$PATH
cd openmpi-5.0.0/
./configure --prefix=<path_to_ompi>
make -j44 all
pip install sphinx_rtd_theme # for some reason openmpi requires this package to install
pip install recommonmark # for some reason openmpi requires this package to install
make -j44 all
make install
export PATH=<path_to_ompi>/bin:$PATH
pip install --no-cache-dir mpi4py
If you are building/installing from a git clone, please copy-n-paste the output from git submodule status.
Please describe the system on which you are running
- Operating system/version:
$ cat /etc/os-release
PRETTY_NAME="Ubuntu 22.04.3 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.3 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy
- Computer hardware: Intel(R) Xeon(R) Platinum 8276L CPU @ 2.20GHz
- Network type: tcp
Details of the problem
Please describe, in detail, the problem that you are having, including the behavior you expect to see, the actual behavior that you are seeing, steps to reproduce the problem, etc. It is most helpful if you can attach a small program that a developer can use to reproduce your problem.
We see Open MPI hanging in MPI_Comm_split_type while we expect it to work.
Note: If you include verbatim output (or a code block), please use a GitHub Markdown code block like below:
C example
#include <stdio.h>
#include <stdlib.h>
#include <sys/param.h>
#include <sys/types.h>
#include <unistd.h>
#include <mpi.h>
/*
 * Reproducer: spawn 8 copies of this binary from a singleton parent,
 * merge the resulting intercommunicator, then call MPI_Comm_split_type,
 * which hangs (the bug being reported).
 *
 * Returns 0 on success, or the MPI_Comm_spawn error code on spawn failure.
 */
int main(int argc, char *argv[])
{
    int rc;
    MPI_Comm parent, child, intracomm;
    int rank, size;
    char hostname[1024];
    pid_t pid;
    char *env_rank, *env_nspace;
    MPI_Info info;

    /* PMIx exports these in the environment before MPI_Init, so they are
     * readable this early; they may be NULL when run outside mpiexec. */
    env_rank = getenv("PMIX_RANK");
    env_nspace = getenv("PMIX_NAMESPACE");
    pid = getpid();
    gethostname(hostname, sizeof hostname);
    printf("[%s:%s pid %ld] starting up on node %s!\n", env_nspace, env_rank,
           (long) pid, hostname);

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    printf("%d completed MPI_Init\n", rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_get_parent(&parent);

    /* If we get COMM_NULL back, then we're the parent. */
    if (MPI_COMM_NULL == parent) {
        pid = getpid();
        printf("Parent [pid %ld] about to spawn!\n", (long) pid);
        MPI_Info_create(&info);
        rc = MPI_Comm_spawn(argv[0], MPI_ARGV_NULL, 8, info, 0, MPI_COMM_WORLD,
                            &child, MPI_ERRCODES_IGNORE);
        MPI_Info_free(&info); /* no longer needed once the spawn returns */
        /* Check the spawn result BEFORE touching 'child'; the original code
         * merged first, which would use an invalid communicator on failure. */
        if (MPI_SUCCESS != rc) {
            printf("Child failed to spawn\n");
            return rc;
        }
        MPI_Intercomm_merge(child, 0, &intracomm);
        printf("Parent done with spawn\n");
        MPI_Comm_disconnect(&child);
        printf("Parent disconnected\n");
    }
    /* Otherwise, we're the child. */
    else {
        printf("In Child process\n");
        MPI_Intercomm_merge(parent, 1, &intracomm);
        MPI_Comm_disconnect(&parent);
        printf("Child %d disconnected\n", rank);
    }

    MPI_Barrier(intracomm);

    /* Reported hang: this call never returns. */
    MPI_Comm new_comm;
    MPI_Comm_split_type(intracomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,
                        &new_comm);

    /* Release communicators before finalizing (reached only if the hang
     * does not occur). */
    MPI_Comm_free(&new_comm);
    MPI_Comm_free(&intracomm);
    MPI_Finalize();
    /* pid_t has no portable printf specifier; cast to long (the original
     * passed pid_t to %d, which is undefined behavior if sizes differ). */
    fprintf(stderr, "%ld: exiting\n", (long) pid);
    return 0;
}
Python example
# reproducer.py
#
# Spawn 8 worker processes from a singleton parent, merge the
# intercommunicator, then call Split_type, which hangs (the bug
# being reported).
import sys
import mpi4py

# Defer MPI initialization so Init_thread() can be called explicitly below.
mpi4py.rc(recv_mprobe=False, initialize=False)
from mpi4py import MPI  # noqa: E402

MPI.Init_thread()
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
parent_comm = MPI.Comm.Get_parent()

if rank == 0 and parent_comm == MPI.COMM_NULL and size == 1:
    # Singleton parent: spawn the workers.
    nprocs_to_spawn = 8  # everything works on 128 and lower values
    args = ["reproducer.py"]
    info = MPI.Info.Create()
    intercomm = MPI.COMM_SELF.Spawn(
        sys.executable,
        args,
        maxprocs=nprocs_to_spawn,
        info=info,
        root=rank,
    )
    # Free the Info object; the original leaked it (Create without Free).
    info.Free()
    comm = intercomm.Merge(high=False)
    intercomm.Disconnect()
else:
    # Spawned child: merge with the parent side.
    comm = parent_comm.Merge(high=True)
    parent_comm.Disconnect()

comm.Barrier()
# Reported hang: this call never returns.
host_comm = comm.Split_type(MPI.COMM_TYPE_SHARED)
MPI.Finalize()
For the C example:
$ mpicc reproducer.c -o reproducer
$ mpiexec -n 1 reproducer
For the Python example:
$ mpiexec -n 1 python reproducer.py