From adf489e8d32c58871460a2d79ad15a0d2bec697d Mon Sep 17 00:00:00 2001
From: Hossein Kavianihamedani
Date: Wed, 12 Nov 2025 10:35:17 -0800
Subject: [PATCH 1/3] Remove _init_dist() method - now handled by provisioner

The _init_dist() method was setting up PyTorch distributed environment
variables manually. This is no longer needed because the provisioner's
get_proc_mesh() method now properly calls setup_env_for_distributed()
from Monarch, which handles all distributed environment setup.

This simplifies the code and removes redundant initialization.
---
 apps/sft/main.py | 52 ++++++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/apps/sft/main.py b/apps/sft/main.py
index 93ba05eed..aad84283b 100644
--- a/apps/sft/main.py
+++ b/apps/sft/main.py
@@ -81,37 +81,37 @@ def __init__(self, config: DictConfig):
         self.gradient_accumulation_steps = 1  # Example value, adjust as needed
         self._rank = current_rank().rank
         self._size = math.prod(current_size().values())
-        self._init_dist()
+        # self._init_dist()
         super().__init__(job_config)
 
-    def _init_dist(self):
-        """Initializes torch distributed.
-
-        torchrun normally hands this, but we need to do it ourselves
-        in monarch for now.
-
-        We should consider putting this into ForgeActor, but having this
-        be explicit for now.
-
-        """
-        env = {
-            "RANK": str(self._rank),
-            "LOCAL_RANK": str(self._rank),
-            "LOCAL_WORLD_SIZE": str(self._size),
-            "GROUP_RANK": str(self._size),
-            "GROUP_WORLD_SIZE": str(self._size),
-            "ROLE_RANK": str(self._rank),
-            "ROLE_WORLD_SIZE": str(self._size),
-            "ROLE_NAME": "rank",
-            "WORLD_SIZE": str(self._size),
-            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
-        }
-        os.environ.update(env)
-        logger.info("env: {}".format(env))
+    # def _init_dist(self):
+    #     """Initializes torch distributed.
+
+    #     torchrun normally hands this, but we need to do it ourselves
+    #     in monarch for now.
+
+    #     We should consider putting this into ForgeActor, but having this
+    #     be explicit for now.
+
+    #     """
+    #     env = {
+    #         "RANK": str(self._rank),
+    #         "LOCAL_RANK": str(self._rank),
+    #         "LOCAL_WORLD_SIZE": str(self._size),
+    #         "GROUP_RANK": str(self._size),
+    #         "GROUP_WORLD_SIZE": str(self._size),
+    #         "ROLE_RANK": str(self._rank),
+    #         "ROLE_WORLD_SIZE": str(self._size),
+    #         "ROLE_NAME": "rank",
+    #         "WORLD_SIZE": str(self._size),
+    #         "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
+    #     }
+    #     os.environ.update(env)
+    #     logger.info("env: {}".format(env))
 
     async def setup_metric_logger(self):
         """Initialization happens in the main process. Here we just retrieve it"""
-        mlogger = await get_or_create_metric_logger()
+        mlogger: GlobalLoggingActor = await get_or_create_metric_logger()
         return mlogger
 
     def record_batch_metrics(self, data_metrics: list):

From 35d580cca5fa1bd575b503e10f7b31a15ba7377c Mon Sep 17 00:00:00 2001
From: Hossein Kavianihamedani
Date: Wed, 12 Nov 2025 10:39:11 -0800
Subject: [PATCH 2/3] Clean up: completely remove commented _init_dist() method

Previously the method was just commented out. This commit fully removes
it since the provisioner now handles all distributed environment setup
via setup_env_for_distributed(). This makes the code cleaner and easier
to maintain.
---
 apps/sft/main.py | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/apps/sft/main.py b/apps/sft/main.py
index aad84283b..b9634392a 100644
--- a/apps/sft/main.py
+++ b/apps/sft/main.py
@@ -81,34 +81,8 @@ def __init__(self, config: DictConfig):
         self.gradient_accumulation_steps = 1  # Example value, adjust as needed
         self._rank = current_rank().rank
         self._size = math.prod(current_size().values())
-        # self._init_dist()
         super().__init__(job_config)
 
-    # def _init_dist(self):
-    #     """Initializes torch distributed.
-
-    #     torchrun normally hands this, but we need to do it ourselves
-    #     in monarch for now.
-
-    #     We should consider putting this into ForgeActor, but having this
-    #     be explicit for now.
-
-    #     """
-    #     env = {
-    #         "RANK": str(self._rank),
-    #         "LOCAL_RANK": str(self._rank),
-    #         "LOCAL_WORLD_SIZE": str(self._size),
-    #         "GROUP_RANK": str(self._size),
-    #         "GROUP_WORLD_SIZE": str(self._size),
-    #         "ROLE_RANK": str(self._rank),
-    #         "ROLE_WORLD_SIZE": str(self._size),
-    #         "ROLE_NAME": "rank",
-    #         "WORLD_SIZE": str(self._size),
-    #         "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
-    #     }
-    #     os.environ.update(env)
-    #     logger.info("env: {}".format(env))
-
     async def setup_metric_logger(self):
         """Initialization happens in the main process. Here we just retrieve it"""
         mlogger: GlobalLoggingActor = await get_or_create_metric_logger()
         return mlogger

From afca6fd0cda69d830730216704a41d9a498ef594 Mon Sep 17 00:00:00 2001
From: Hossein Kavianihamedani
Date: Wed, 12 Nov 2025 10:55:41 -0800
Subject: [PATCH 3/3] Remove hint

---
 apps/sft/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/sft/main.py b/apps/sft/main.py
index b9634392a..936a57701 100644
--- a/apps/sft/main.py
+++ b/apps/sft/main.py
@@ -85,7 +85,7 @@ def __init__(self, config: DictConfig):
 
     async def setup_metric_logger(self):
         """Initialization happens in the main process. Here we just retrieve it"""
-        mlogger: GlobalLoggingActor = await get_or_create_metric_logger()
+        mlogger = await get_or_create_metric_logger()
         return mlogger
 
     def record_batch_metrics(self, data_metrics: list):
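
For reference, the torchrun-style environment setup that this series delegates
to the provisioner (via get_proc_mesh() calling Monarch's
setup_env_for_distributed(), per the commit messages) amounts to exporting the
variables below before torch.distributed initializes. The following is a
minimal, self-contained sketch of that setup, mirroring the removed
_init_dist(); the helper name setup_distributed_env and its signature are
assumptions for illustration, not the actual provisioner or Monarch API.

    import os


    def setup_distributed_env(rank: int, world_size: int) -> dict[str, str]:
        """Export the torchrun-style variables torch.distributed expects.

        Mirrors what the removed _init_dist() set inline in the trainer,
        including GROUP_RANK being assigned the world size as in the
        original code. In the new flow the provisioner performs the
        equivalent setup before the training actor starts.
        """
        env = {
            "RANK": str(rank),
            "LOCAL_RANK": str(rank),
            "LOCAL_WORLD_SIZE": str(world_size),
            "GROUP_RANK": str(world_size),
            "GROUP_WORLD_SIZE": str(world_size),
            "ROLE_RANK": str(rank),
            "ROLE_WORLD_SIZE": str(world_size),
            "ROLE_NAME": "rank",
            "WORLD_SIZE": str(world_size),
            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        }
        os.environ.update(env)
        return env


    if __name__ == "__main__":
        # Single-process example: rank 0 of a world of size 1.
        print(setup_distributed_env(rank=0, world_size=1))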