From adf489e8d32c58871460a2d79ad15a0d2bec697d Mon Sep 17 00:00:00 2001
From: Hossein Kavianihamedani
Date: Wed, 12 Nov 2025 10:35:17 -0800
Subject: [PATCH 1/3] Remove _init_dist() method - now handled by provisioner

The _init_dist() method was setting up PyTorch distributed environment
variables manually. This is no longer needed because the provisioner's
get_proc_mesh() method now properly calls setup_env_for_distributed()
from Monarch, which handles all distributed environment setup.

This simplifies the code and removes redundant initialization.
---
 apps/sft/main.py | 52 ++++++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/apps/sft/main.py b/apps/sft/main.py
index 93ba05eed..aad84283b 100644
--- a/apps/sft/main.py
+++ b/apps/sft/main.py
@@ -81,37 +81,37 @@ def __init__(self, config: DictConfig):
         self.gradient_accumulation_steps = 1  # Example value, adjust as needed
         self._rank = current_rank().rank
         self._size = math.prod(current_size().values())
-        self._init_dist()
+        # self._init_dist()
         super().__init__(job_config)
 
-    def _init_dist(self):
-        """Initializes torch distributed.
-
-        torchrun normally hands this, but we need to do it ourselves
-        in monarch for now.
-
-        We should consider putting this into ForgeActor, but having this
-        be explicit for now.
-
-        """
-        env = {
-            "RANK": str(self._rank),
-            "LOCAL_RANK": str(self._rank),
-            "LOCAL_WORLD_SIZE": str(self._size),
-            "GROUP_RANK": str(self._size),
-            "GROUP_WORLD_SIZE": str(self._size),
-            "ROLE_RANK": str(self._rank),
-            "ROLE_WORLD_SIZE": str(self._size),
-            "ROLE_NAME": "rank",
-            "WORLD_SIZE": str(self._size),
-            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
-        }
-        os.environ.update(env)
-        logger.info("env: {}".format(env))
+    # def _init_dist(self):
+    #     """Initializes torch distributed.
+
+    #     torchrun normally hands this, but we need to do it ourselves
+    #     in monarch for now.
+
+    #     We should consider putting this into ForgeActor, but having this
+    #     be explicit for now.
+
+    #     """
+    #     env = {
+    #         "RANK": str(self._rank),
+    #         "LOCAL_RANK": str(self._rank),
+    #         "LOCAL_WORLD_SIZE": str(self._size),
+    #         "GROUP_RANK": str(self._size),
+    #         "GROUP_WORLD_SIZE": str(self._size),
+    #         "ROLE_RANK": str(self._rank),
+    #         "ROLE_WORLD_SIZE": str(self._size),
+    #         "ROLE_NAME": "rank",
+    #         "WORLD_SIZE": str(self._size),
+    #         "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
+    #     }
+    #     os.environ.update(env)
+    #     logger.info("env: {}".format(env))
 
     async def setup_metric_logger(self):
         """Initialization happens in the main process. Here we just retrieve it"""
-        mlogger = await get_or_create_metric_logger()
+        mlogger: GlobalLoggingActor = await get_or_create_metric_logger()
         return mlogger
 
     def record_batch_metrics(self, data_metrics: list):

From 35d580cca5fa1bd575b503e10f7b31a15ba7377c Mon Sep 17 00:00:00 2001
From: Hossein Kavianihamedani
Date: Wed, 12 Nov 2025 10:39:11 -0800
Subject: [PATCH 2/3] Clean up: completely remove commented _init_dist() method

Previously the method was just commented out. This commit fully removes
it since the provisioner now handles all distributed environment setup
via setup_env_for_distributed(). This makes the code cleaner and easier
to maintain.
---
 apps/sft/main.py | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/apps/sft/main.py b/apps/sft/main.py
index aad84283b..b9634392a 100644
--- a/apps/sft/main.py
+++ b/apps/sft/main.py
@@ -81,34 +81,8 @@ def __init__(self, config: DictConfig):
         self.gradient_accumulation_steps = 1  # Example value, adjust as needed
         self._rank = current_rank().rank
         self._size = math.prod(current_size().values())
-        # self._init_dist()
         super().__init__(job_config)
 
-    # def _init_dist(self):
-    #     """Initializes torch distributed.
-
-    #     torchrun normally hands this, but we need to do it ourselves
-    #     in monarch for now.
-
-    #     We should consider putting this into ForgeActor, but having this
-    #     be explicit for now.
-
-    #     """
-    #     env = {
-    #         "RANK": str(self._rank),
-    #         "LOCAL_RANK": str(self._rank),
-    #         "LOCAL_WORLD_SIZE": str(self._size),
-    #         "GROUP_RANK": str(self._size),
-    #         "GROUP_WORLD_SIZE": str(self._size),
-    #         "ROLE_RANK": str(self._rank),
-    #         "ROLE_WORLD_SIZE": str(self._size),
-    #         "ROLE_NAME": "rank",
-    #         "WORLD_SIZE": str(self._size),
-    #         "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
-    #     }
-    #     os.environ.update(env)
-    #     logger.info("env: {}".format(env))
-
     async def setup_metric_logger(self):
         """Initialization happens in the main process. Here we just retrieve it"""
         mlogger: GlobalLoggingActor = await get_or_create_metric_logger()
         return mlogger

From afca6fd0cda69d830730216704a41d9a498ef594 Mon Sep 17 00:00:00 2001
From: Hossein Kavianihamedani
Date: Wed, 12 Nov 2025 10:55:41 -0800
Subject: [PATCH 3/3] Remove hint

---
 apps/sft/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/sft/main.py b/apps/sft/main.py
index b9634392a..936a57701 100644
--- a/apps/sft/main.py
+++ b/apps/sft/main.py
@@ -85,7 +85,7 @@ def __init__(self, config: DictConfig):
 
     async def setup_metric_logger(self):
         """Initialization happens in the main process. Here we just retrieve it"""
-        mlogger: GlobalLoggingActor = await get_or_create_metric_logger()
+        mlogger = await get_or_create_metric_logger()
         return mlogger
 
     def record_batch_metrics(self, data_metrics: list):
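
For reference, the torchrun-style environment setup that this series delegates
to the provisioner (via get_proc_mesh() calling Monarch's
setup_env_for_distributed(), per the commit messages) amounts to exporting the
variables below before torch.distributed initializes. The following is a
minimal, self-contained sketch of that setup, mirroring the removed
_init_dist(); the helper name setup_distributed_env and its signature are
assumptions for illustration, not the actual provisioner or Monarch API.

    import os


    def setup_distributed_env(rank: int, world_size: int) -> dict[str, str]:
        """Export the torchrun-style variables torch.distributed expects.

        Mirrors what the removed _init_dist() set inline in the trainer,
        including GROUP_RANK being assigned the world size as in the
        original code. In the new flow the provisioner performs the
        equivalent setup before the training actor starts.
        """
        env = {
            "RANK": str(rank),
            "LOCAL_RANK": str(rank),
            "LOCAL_WORLD_SIZE": str(world_size),
            "GROUP_RANK": str(world_size),
            "GROUP_WORLD_SIZE": str(world_size),
            "ROLE_RANK": str(rank),
            "ROLE_WORLD_SIZE": str(world_size),
            "ROLE_NAME": "rank",
            "WORLD_SIZE": str(world_size),
            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        }
        os.environ.update(env)
        return env


    if __name__ == "__main__":
        # Single-process example: rank 0 of a world of size 1.
        print(setup_distributed_env(rank=0, world_size=1))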