In [1]:
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

# @noautodeps
# pyre-ignore-all-errors
import json
import logging
import socket
import sys

import cloudpickle
from example_actors.compute_world_size_actor import ComputeWorldSizeActor
from monarch.actor import Actor, endpoint
from slurm.utils import create_slurm_job, cleanup_job


# Configure the root logger for this example. ``force=True`` replaces any
# handlers that were installed before this point, so the format below is the
# one that takes effect.
logging.basicConfig(
    force=True,
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(name)s %(asctime)s %(levelname)s %(message)s",
)


# Module-level logger, named after the module this code runs in.
logger: logging.Logger = logging.getLogger(__name__)

class _HostnameActor(Actor):
    """Helper actor that reports the hostname of the process it runs in.

    ``main`` calls it on rank 0 only, to obtain the master address.
    """

    @endpoint
    def get_hostname(self) -> str:
        """Return ``socket.gethostname()`` as seen by this actor's process."""
        hostname = socket.gethostname()
        return hostname


async def main(
    *,
    num_nodes: int = 2,
    gpus_per_node: int = 4,
    mesh_name: str = "mesh0",
    master_port: int = 29500,
) -> None:
    """Run the world-size example on a SLURM-backed mesh.

    Creates a SLURM job, spawns processes on the mesh called ``mesh_name``,
    reads the hostname of rank 0 to use as the master address, calls
    ``ComputeWorldSizeActor.compute_world_size`` on every process and logs
    the value returned for each rank. The job is always passed to
    ``cleanup_job`` on exit, including when an error occurs.

    Args:
        num_nodes: Node count passed to ``create_slurm_job``.
        gpus_per_node: GPU count passed to ``create_slurm_job`` and used as
            the ``"gpus"`` extent when spawning processes.
        mesh_name: Name passed to ``create_slurm_job`` and then used to look
            the mesh up on the job state.
        master_port: Port forwarded to ``compute_world_size``.
    """
    # Create SLURM job
    slurm_job = create_slurm_job(mesh_name, num_nodes, gpus_per_node)

    try:
        # Get job state and create process mesh. The mesh is looked up by
        # ``mesh_name`` rather than a hard-coded attribute so the name given
        # to ``create_slurm_job`` and the lookup cannot drift apart.
        job_state = slurm_job.state()
        host_mesh = getattr(job_state, mesh_name)
        proc_mesh = host_mesh.spawn_procs({"gpus": gpus_per_node})

        # Get master_addr from rank 0
        hostname_actor = proc_mesh.spawn("hostname_actor", _HostnameActor)
        hostname_values = await hostname_actor.flatten("rank").slice(rank=0).get_hostname.call()
        master_addr = hostname_values.item()

        # Spawn actor
        actor = proc_mesh.spawn("compute_world_size_actor", ComputeWorldSizeActor)

        logger.info("computing world size...")
        values = await actor.compute_world_size.call(
            master_addr=master_addr,
            master_port=master_port,
        )

        # Flatten to a single "rank" dimension so each point carries one rank.
        values_by_rank = {f"rank_{p.rank}": v for p, v in values.flatten("rank")}

        logger.info(
            f"""computed world_sizes:
    {'-'*40}
    {json.dumps(values_by_rank, indent=2)}
    {'-'*40}"""
        )
    finally:
        # Runs on success and on failure, so the job is not left behind.
        await cleanup_job(slurm_job)


if __name__ == "__main__":
    cloudpickle.register_pickle_by_value(sys.modules[ComputeWorldSizeActor.__module__])
    cloudpickle.register_pickle_by_value(sys.modules[_HostnameActor.__module__])

    await main()

Found cached job at path: .monarch/job_state.pkl
Error checking job 7757 status: slurm_load_jobs error: Invalid job id specified

SLURM job 7757 not found in queue
Cached job cannot run this spec, removing cache
Cancelled SLURM job 7757
Applying current job
Submitting SLURM job with 2 nodes
SLURM job 7758 submitted. Logs will be written to: /home/mreso/monarch/examples/slurm_7758_monarch_example_1784833.out
Saving job to cache at .monarch/job_state.pkl
Job has started, connecting to current state
SLURM job 7758 is running on 2 nodes: ['slurm-compute-node-090', 'slurm-compute-node-091']
__main__ 2025-11-15 01:12:07 INFO computing world size...


[36m>>> Aggregated Logs (2025-11-15 01:12:06) >>>[0m
[33m[1 similar log lines][0m [7] Initializing process group `nccl`:
[33m[1 similar log lines][0m [7]   MASTER_ADDR = slurm-compute-node-090
[33m[1 similar log lines][0m [7]   MASTER_PORT = 29500
[33m[1 similar log lines][0m [7]   RANK        = 7
[33m[1 similar log lines][0m [7]   WORLD_SIZE  = 8
[36m<<< Aggregated Logs (2025-11-15 01:12:09) <<<[0m

[36m>>> Aggregated Logs (2025-11-15 01:12:09) >>>[0m
[33m[7 similar log lines][0m [4] Initializing process group `nccl`:
[33m[7 similar log lines][0m [4]   MASTER_ADDR = slurm-compute-node-090
[33m[7 similar log lines][0m [4]   MASTER_PORT = 29500
[33m[7 similar log lines][0m [4]   RANK        = 4
[33m[7 similar log lines][0m [4]   WORLD_SIZE  = 8
[36m<<< Aggregated Logs (2025-11-15 01:12:12) <<<[0m



__main__ 2025-11-15 01:12:13 INFO computed world_sizes:
    ----------------------------------------
    {
  "rank_0": 8,
  "rank_1": 8,
  "rank_2": 8,
  "rank_3": 8,
  "rank_4": 8,
  "rank_5": 8,
  "rank_6": 8,
  "rank_7": 8
}
    ----------------------------------------
Cancelled SLURM job 7758
slurm.utils 2025-11-15 01:12:13 INFO Job terminated successfully
