diff --git a/apps/sft/main.py b/apps/sft/main.py
index 93ba05eed..936a57701 100644
--- a/apps/sft/main.py
+++ b/apps/sft/main.py
@@ -81,34 +81,8 @@ def __init__(self, config: DictConfig):
         self.gradient_accumulation_steps = 1  # Example value, adjust as needed
         self._rank = current_rank().rank
         self._size = math.prod(current_size().values())
-        self._init_dist()
         super().__init__(job_config)
 
-    def _init_dist(self):
-        """Initializes torch distributed.
-
-        torchrun normally hands this, but we need to do it ourselves
-        in monarch for now.
-
-        We should consider putting this into ForgeActor, but having this
-        be explicit for now.
-
-        """
-        env = {
-            "RANK": str(self._rank),
-            "LOCAL_RANK": str(self._rank),
-            "LOCAL_WORLD_SIZE": str(self._size),
-            "GROUP_RANK": str(self._size),
-            "GROUP_WORLD_SIZE": str(self._size),
-            "ROLE_RANK": str(self._rank),
-            "ROLE_WORLD_SIZE": str(self._size),
-            "ROLE_NAME": "rank",
-            "WORLD_SIZE": str(self._size),
-            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
-        }
-        os.environ.update(env)
-        logger.info("env: {}".format(env))
-
     async def setup_metric_logger(self):
         """Initialization happens in the main process. Here we just retrieve it"""
         mlogger = await get_or_create_metric_logger()
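
For reference, the removed `_init_dist` populated the environment variables that torchrun would otherwise provide so that torch.distributed could initialize under monarch. Below is a minimal standalone sketch of that behavior, assuming rank and world size are passed in explicitly; the `init_dist_env` name and signature are illustrative only and are not part of the codebase.

```python
import logging
import os

logger = logging.getLogger(__name__)


def init_dist_env(rank: int, world_size: int) -> None:
    """Set the env vars torchrun normally provides, mirroring the
    deleted _init_dist helper (values copied from the removed code)."""
    env = {
        "RANK": str(rank),
        "LOCAL_RANK": str(rank),
        "LOCAL_WORLD_SIZE": str(world_size),
        "GROUP_RANK": str(world_size),
        "GROUP_WORLD_SIZE": str(world_size),
        "ROLE_RANK": str(rank),
        "ROLE_WORLD_SIZE": str(world_size),
        "ROLE_NAME": "rank",
        "WORLD_SIZE": str(world_size),
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
    os.environ.update(env)
    logger.info("env: %s", env)
```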