In [3]:
from lightning_sdk import Machine, MMT, Studio

In [None]:
from utils.master_node import MasterNodeServer
# Look up the master node's addresses via the three MasterNodeServer helpers.
# The two "public" lookups are printed under distinct labels so their results
# can be told apart in the output.
private_master_host_ip_address = MasterNodeServer.get_master_ip()
public_master_host_ip_address = MasterNodeServer.get_master_public_ip_curl()
public_master_host_ip_address_services = MasterNodeServer.get_master_public_ip()
print(f"private_master_host_ip_address = {private_master_host_ip_address}")
print(f"public_master_host_ip_address = {public_master_host_ip_address}")
print(f"public_master_host_ip_address_services = {public_master_host_ip_address_services}")

10.192.12.177


In [None]:
# Configuration shared by the cells below.
NUM_NODES = 2  # Number of machines requested for the MMT job / workers expected to register
NUM_GPUS = 8  # GPUs per machine; also selects the Machine type name (e.g. L40S_X_8)
TEAMSPACE = "general"  # Replace with your teamspace
USER = "meta-ai"  # Replace with your username
MONARCH_DEFAULT_PORT = 26600 # Monarch default port
HTTP_SERVER_PORT = MONARCH_DEFAULT_PORT # HTTP server port for worker IP registration (8080 was used previously)

In [None]:
def launch_mmt_job(num_nodes=2, teamspace="my-teamspace", user="my-user"):
    """
    Launch a multi-machine training job using Lightning SDK's MMT API.

    Each machine runs ``example/utils/worker_node.py`` with the master's public
    IP and ``HTTP_SERVER_PORT``, sleeps 10 seconds, then starts
    ``process_allocator``.

    Reads the notebook globals ``NUM_GPUS``, ``HTTP_SERVER_PORT`` and
    ``public_master_host_ip_address``.

    Args:
        num_nodes: Number of machines, passed to ``MMT.run`` as ``num_machines``.
        teamspace: Currently unused; ``Studio()`` is created with its defaults.
        user: Currently unused; kept so existing callers keep working.

    Returns:
        A ``(job, studio)`` tuple: the object returned by ``MMT.run`` and the
        ``Studio`` the job was launched from.
    """

    studio = Studio()

    # Install the MMT plugin before running the actual job
    studio.install_plugin("multi-machine-training")

    print(f"Launching MMT job with {num_nodes} nodes...")

    # Machine with L40S GPUs (for T4 GPUs, use f"T4_X_{NUM_GPUS}" instead)
    machine_type = getattr(Machine, f"L40S_X_{NUM_GPUS}")

    # Expose exactly the GPUs the chosen machine type has, instead of a
    # hard-coded "0,...,7" that is only right when NUM_GPUS == 8.
    visible_devices = ",".join(str(gpu_index) for gpu_index in range(NUM_GPUS))

    job = MMT.run(
        command=f"python example/utils/worker_node.py {public_master_host_ip_address} {HTTP_SERVER_PORT} && sleep 10 && process_allocator",
        name="Multi-Node-Monarch-Titan",
        machine=machine_type,
        studio=studio,
        num_machines=num_nodes,
        env={
            "CUDA_VISIBLE_DEVICES": visible_devices,  # Make all GPUs visible
        },
    )

    print(f"Job started with ID: {job.name}")
    print(f"Job status: {job.status}")

    # Returned so the caller can monitor job.status and stop the job/studio later
    return job, studio

In [7]:
# Start the multi-machine job and keep handles to it and to the Studio.
job, studio = launch_mmt_job(num_nodes=NUM_NODES, teamspace=TEAMSPACE, user=USER)

# Reminders for managing the job from later cells.
print(
    "Job launched. You can monitor it using: job.status",
    "To stop the job: job.stop()",
    "To clean up: studio.stop()",
    sep="\n",
)

Launching MMT job with 2 nodes...


INFO - Multi-Machine Job was successfully launched. View it at https://lightning.ai/meta-ai/general/jobs/Multi-Node-Monarch-Titan?app_id=mmt


Job started with ID: Multi-Node-Monarch-Titan
Job status: Pending
Job launched. You can monitor it using: job.status
To stop the job: job.stop()
To clean up: studio.stop()


In [None]:
from utils.master_node import run_master_server
# Start the registration server on HTTP_SERVER_PORT, expecting NUM_NODES workers.
# Per the recorded output, the call returns after all workers have registered
# and it has written /tmp/worker_nodes.txt and /tmp/cluster_info.json.
cluster_info = run_master_server(expected_workers=NUM_NODES, port=HTTP_SERVER_PORT)

Master node IP: 10.192.12.177
Expecting 2 worker nodes to register...
Starting server on port 8080...
Waiting for workers... (0/2 registered) - Elapsed: 0s
Server started on 10.192.12.177:8080
Waiting for workers... (0/2 registered) - Elapsed: 30s
Waiting for workers... (0/2 registered) - Elapsed: 60s
Waiting for workers... (0/2 registered) - Elapsed: 90s
Waiting for workers... (0/2 registered) - Elapsed: 120s
Waiting for workers... (0/2 registered) - Elapsed: 150s
Waiting for workers... (0/2 registered) - Elapsed: 180s
Waiting for workers... (0/2 registered) - Elapsed: 210s
Waiting for workers... (0/2 registered) - Elapsed: 240s
Waiting for workers... (0/2 registered) - Elapsed: 270s
Waiting for workers... (0/2 registered) - Elapsed: 300s
Waiting for workers... (0/2 registered) - Elapsed: 330s
Waiting for workers... (0/2 registered) - Elapsed: 360s
Waiting for workers... (0/2 registered) - Elapsed: 390s
Waiting for workers... (0/2 registered) - Elapsed: 420s
Waiting for workers... (0/

10.192.12.52 - - [19/Sep/2025 03:51:42] "POST /register HTTP/1.1" 200 -


Registered worker node: 10.192.12.72 (2/2)
All worker nodes registered!
Registration server stopped
Final registered worker nodes: ['10.192.12.52', '10.192.12.72']
Worker IPs saved to /tmp/worker_nodes.txt
Cluster info saved to /tmp/cluster_info.json


10.192.12.72 - - [19/Sep/2025 03:51:44] "POST /register HTTP/1.1" 200 -


In [9]:
from utils.ip_utils import extract_ips_simple
# File the registration server reported saving the worker IPs to.
worker_nodes_ip_file_path = "/tmp/worker_nodes.txt"
# extract_ips_simple's result is used as a collection of IP strings
# (the recorded output shows a set) — confirm against utils.ip_utils.
ip_addresses_set = extract_ips_simple(worker_nodes_ip_file_path)
# List copy of the same addresses, printed for inspection.
ip_addresses_list = list(ip_addresses_set)
print(ip_addresses_list)

Extracted IP addresses:
10.192.12.52
10.192.12.72

IP set: {'10.192.12.72', '10.192.12.52'}
['10.192.12.72', '10.192.12.52']


In [10]:
# One Monarch address per registered worker, on the port defined in the
# configuration cell. Sorting keeps the address order stable across runs
# (set iteration order is not guaranteed to be).
tcp_addresses = [f"tcp!{ip}:{MONARCH_DEFAULT_PORT}" for ip in sorted(ip_addresses_set)]

# # Or if you want to test it locally first on the local machine uncomment line below:
# tcp_addresses = ["tcp![::]:26600"]
# # For the local host machine only, please make sure that NUM_NODES is equal to 1;
# NUM_NODES = 1

print(*tcp_addresses)

tcp!10.192.12.72:26600 tcp!10.192.12.52:26600


# Example 1 - Run TorchTitan using Monarch for Llama 3 - 8B

In [12]:
from monarch._src.actor.allocator import RemoteAllocator, StaticRemoteAllocInitializer
from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec
from monarch.actor import ProcMesh

allocator = RemoteAllocator(
        world_id="foo",
        initializer=StaticRemoteAllocInitializer(*tcp_addresses),
    )

alloc = allocator.allocate(
        AllocSpec(AllocConstraints(), hosts=NUM_NODES, gpus=NUM_GPUS)
    )

proc_mesh = await ProcMesh.from_alloc(alloc)

In [13]:
import getpass
def get_job_name(num_hosts: int, num_gpus_per_host: int):
    """Build a job name from the current OS user, host count and GPUs per host.

    Args:
        num_hosts: Number of hosts, embedded after ``hosts``.
        num_gpus_per_host: GPUs per host, embedded after ``gpus``.

    Returns:
        A string of the form ``monarch-<user>-hosts<N>-gpus<M>``.
    """
    current_user = getpass.getuser()
    return f"monarch-{current_user}-hosts{num_hosts}-gpus{num_gpus_per_host}"
# Show the job name derived from the current configuration.
print(get_job_name(num_hosts=NUM_NODES, num_gpus_per_host=NUM_GPUS))

monarch-alisol-hosts2-gpus8


In [14]:
import os
import sys
import logging
from monarch.actor import ProcMesh, Actor, endpoint, current_rank
import socket
from torchtitan.tools.logging import init_logger, logger
from torchtitan.train import Trainer
from typing import Optional
import torch
from torchtitan.config import JobConfig


class TitanTrainerWrapper(Actor):
    """Monarch actor that runs a TorchTitan ``Trainer`` with a given job config.

    Call ``init`` and then ``train`` through their endpoints.
    """

    def __init__(self, job_config: JobConfig):
        """Store this actor's rank and the TorchTitan job configuration."""
        self.rank = current_rank().rank
        self.job_config = job_config

    def _rprint(self, msg):
        """Helper method to print with rank information."""
        print(f"{self.rank=} {msg}")

    @endpoint
    def init(self):
        """Attach a stderr handler to the root logger and print rank and hostname."""
        logging.getLogger().addHandler(logging.StreamHandler(sys.stderr))
        print(f"Initializing actor: {self.rank} {current_rank()=} {socket.gethostname()=}")

    @endpoint
    def train(self):
        """Either create a seed checkpoint or run training, then clean up.

        If ``config.checkpoint.create_seed_checkpoint`` is set, a checkpoint is
        saved at step 0 and no training is run; otherwise ``Trainer.train()``
        is called exactly once.

        Raises:
            AssertionError: If a seed checkpoint is requested with
                ``WORLD_SIZE != 1`` or with checkpointing disabled.
        """
        logger.info("Starting training")
        config = self.job_config
        trainer: Optional[Trainer] = None

        try:
            trainer = Trainer(config)

            # Do not train before this branch: a seed checkpoint must be saved
            # from the untrained model, and the normal path must train once.
            if config.checkpoint.create_seed_checkpoint:
                assert (
                    int(os.environ["WORLD_SIZE"]) == 1
                ), "Must create seed checkpoint using a single device, to disable sharding."
                assert (
                    config.checkpoint.enable
                ), "Must enable checkpointing when creating a seed checkpoint."
                trainer.checkpointer.save(curr_step=0)
                logger.info("Created seed checkpoint")
            else:
                trainer.train()
        finally:
            # Release trainer resources even when training raised.
            if trainer:
                trainer.close()

            if torch.distributed.is_initialized():
                torch.distributed.destroy_process_group()
                logger.info("Process group destroyed.")
        print("Done training")

In [15]:
from torch.xpu import stream
from torchtitan.config import ConfigManager, JobConfig
from monarch.utils import setup_env_for_distributed

async def async_main(job_config: JobConfig):
    """Spawn ``TitanTrainerWrapper`` actors on the notebook's ``proc_mesh`` and train.

    Sets up the distributed environment and log streaming on ``proc_mesh``,
    spawns the actors with ``job_config``, then calls their ``init`` and
    ``train`` endpoints in that order.

    Args:
        job_config: TorchTitan job configuration passed to every actor.
    """
    torch.use_deterministic_algorithms(True)
    mesh_job_name = get_job_name(NUM_NODES, NUM_GPUS)

    await setup_env_for_distributed(proc_mesh)

    # Stream actor logs back to this client, aggregated over 3-second windows.
    await proc_mesh.logging_option(stream_to_client=True, aggregate_window_sec=3)

    print(job_config)
    print(f"Spawning meshes on {mesh_job_name}")

    trainers = await proc_mesh.spawn("trainer_actor", TitanTrainerWrapper, job_config)
    await trainers.init.call()
    await trainers.train.call()

In [16]:
init_logger()
config_manager = ConfigManager()

job_name = get_job_name(NUM_NODES, NUM_GPUS)

manual_args = [
        "--job.config_file",
        os.path.expanduser("/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml"),
        "--model.tokenizer-path",
        # f"{FUSE_DST}/Llama-3.1-8B",
        "/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B",
        "--training.steps",
        "25",
        "--training.dataset_path",
        # f"{FUSE_DST}/c4",
        "/teamspace/studios/this_studio/torchtitan/tests/assets/c4_test",
        "--job.dump_folder",
        # f"{FUSE_DST}/outputs/" + job_name,
        "/teamspace/studios/this_studio/torchtitan/outputs/" + job_name
    ]
config = config_manager.parse_args(manual_args)
await async_main(config)



JobConfig(job=Job(config_file='/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml', dump_folder='/teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts2-gpus8', description='Llama 3 8B training', print_args=False), profiling=Profiling(enable_profiling=True, save_traces_folder='profile_trace', profile_freq=100, enable_memory_snapshot=False, save_memory_snapshot_folder='memory_snapshot'), metrics=Metrics(log_freq=1, enable_tensorboard=True, disable_color_printing=False, save_tb_folder='tb', save_for_all_ranks=False, enable_wandb=True), model=Model(name='llama3', flavor='8B', hf_assets_path='/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B', tokenizer_path='/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B', converters=[], print_after_conversion=False), optimizer=Optimizer(name='AdamW', lr=0.0003, beta1=0.9, beta2=0.95, eps=1e-08, weight_decay=0.1, implementation='fused', early_step_in_backward=Fal

[36m>>> Aggregated Logs (2025-09-19 03:54:11) >>>[0m
[33m[16 similar log lines][0m Starting training
[33m[16 similar log lines][0m Starting job: Llama 3 8B training
[33m[15 similar log lines][0m [W919 03:54:48.025441173 socket.cpp:767] [c10d] The client socket has failed to connect to [ip-10-192-12-72]:50717 (errno: 22 - Invalid argument).
[33m[16 similar log lines][0m Building 1-D device mesh with ['dp_shard'], [16]
[33m[16 similar log lines][0m [GC] Initial GC collection 0.00 seconds
[33m[16 similar log lines][0m Loading tokenizer from tokenizer.json
[36m<<< Aggregated Logs (2025-09-19 03:54:49) <<<[0m



[36m>>> Aggregated Logs (2025-09-19 03:54:46) >>>[0m
[33m[15 similar log lines][0m Initializing actor: 5 current_rank()={'hosts': 0/2, 'gpus': 5/8} socket.gethostname()='ip-10-192-12-72'
[36m<<< Aggregated Logs (2025-09-19 03:54:49) <<<[0m



[36m>>> Aggregated Logs (2025-09-19 03:54:49) >>>[0m
[33m[16 similar log lines][0m Preparing c4_test dataset from /teamspace/studios/this_studio/torchtitan/tests/assets/c4_test
[33m[16 similar log lines][0m Building llama3 8B with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, vocab_size=128256, multiple_of=1024, ffn_dim_multiplier=1.3, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)
[33m[16 similar log lines][0m CUDA capacity: NVIDIA L40S with 44.64GiB memory
[33m[31 similar log lines][0m Peak flops undefined for: NVIDIA L40S, fallback to A100
[33m[16 similar log lines][0m [34mModel llama3 8B [31msize: 8,030,261,248 total parameters[39m
[33m[16 similar log lines][0m Applied selective activation checkpointing to the model
[33m[16 similar log lines][0m Applied FSDP to the model
[33m[15 similar log li

[36m>>> Aggregated Logs (2025-09-19 03:54:49) >>>[0m
[33m[14 similar log lines][0m Done training
[36m<<< Aggregated Logs (2025-09-19 04:02:08) <<<[0m



[36m>>> Aggregated Logs (2025-09-19 04:02:08) >>>[0m
[33m[1 similar log lines][0m Training completed
[33m[1 similar log lines][0m wandb: updating run metadata
[33m[1 similar log lines][0m wandb:                                                                                
[33m[3 similar log lines][0m wandb: 
[33m[1 similar log lines][0m wandb: Run history:
[33m[1 similar log lines][0m wandb:                    grad_norm ▁▁█▇▂▄▂▂▃▂▁▅▆▆▃▁▃▁▁▂▂▂▁▁▁
[33m[1 similar log lines][0m wandb: loss_metrics/global_avg_loss ▇▆▇█▆▆▆▅▅▄▃▃▅▄▂▂▂▁▁▁▃▁▁▁▁
[33m[1 similar log lines][0m wandb: loss_metrics/global_max_loss ▅▄▆▆▅▅▅▃▄▃▂▃▄▃▂▂▂▁▂▂█▂▂▁▁
[33m[1 similar log lines][0m wandb:                           lr ▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇██
[33m[4 similar log lines][0m wandb:         memory/max_active(%) ▁████████████████████████
[33m[1 similar log lines][0m wandb:     memory/num_alloc_retries ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[33m[1 similar log lines][0m wandb:              memory/num_ooms ▁▁

[36m>>> Aggregated Logs (2025-09-19 04:02:08) >>>[0m
[33m[2 similar log lines][0m Done training
[36m<<< Aggregated Logs (2025-09-19 04:02:11) <<<[0m



In [22]:
proc_mesh.stop()

<monarch._src.actor.future.Future at 0x70673f3e6950>