meta-pytorch · tushar00jain · Oct 7, 2025 · Oct 7, 2025 · Oct 7, 2025
diff --git a/torchft/local_sgd.py b/torchft/local_sgd.py
@@ -11,6 +11,7 @@
 
 import logging
 import math
+import os
 from contextlib import nullcontext
 from types import TracebackType
 from typing import Any, Dict, List, Optional, Tuple, Type
@@ -25,6 +26,8 @@
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+USE_BUCKETIZATION_ENV: str = "TORCHFT_USE_BUCKETIZATION"
+
 
 def extract_local_tensor(t: torch.Tensor) -> torch.Tensor:
     """
@@ -171,7 +174,7 @@ def _average(self) -> list[torch.Tensor]:
 
 
 class _StreamingDiLoCoFragment:
-    bucket_cap_mb: int = 32 * 1024 * 1024
+    bucket_cap_mb: int = 1 * 1024 * 1024 * 1024
     use_bucketization: bool = False
 
     def __init__(
@@ -220,7 +223,11 @@ def __init__(
         if bucket_cap_mb is not None:
             self.bucket_cap_mb = int(bucket_cap_mb * 1024 * 1024)
 
-        self.use_bucketization = use_bucketization
+        if os.getenv(USE_BUCKETIZATION_ENV, "False") == "True":
+            self.use_bucketization = True
+        else:
+            self.use_bucketization = use_bucketization
+
         self.should_quantize = should_quantize
 
         self._grads: Dict[str, torch.Tensor] = {}
@@ -535,14 +542,9 @@ def _bucketize_and_allreduce(
             def callback(
                 fut: torch.futures.Future[list[torch.Tensor]],
             ) -> list[torch.Tensor]:
-                with torch.cuda.stream(self._stream) if self._stream else nullcontext():
-                    nonlocal bucket_tensors, flat_buffer
-                    # Setup stream dependency
-                    fut.wait()
-                    for t, pack_offset, numel in bucket_tensors:
-                        t.copy_(
-                            flat_buffer[pack_offset : pack_offset + numel].view_as(t)
-                        )
+                nonlocal bucket_tensors, flat_buffer
+                for t, pack_offset, numel in bucket_tensors:
+                    t.copy_(flat_buffer[pack_offset : pack_offset + numel].view_as(t))
 
                 return []
 

diff --git a/torchft/manager.py b/torchft/manager.py
@@ -423,7 +423,9 @@ def allreduce(
                     torch.accelerator.current_stream(),
                 )
             else:
-                work = self._pg.allreduce([tensor], reduce_op)
+                opts = AllreduceOptions()
+                opts.reduceOp = reduce_op
+                work = self._pg.allreduce([tensor], opts)
 
             # schedule grad normalization as a continuation
             # on the Future

diff --git a/torchft/process_group.py b/torchft/process_group.py
@@ -791,6 +791,7 @@ def _create_pg(self, store: Store, rank: int, world_size: int) -> BaseProcessGro
         # pyre-fixme[16]: no attribute ProcessGroupNCCL
         opts = BaseProcessGroupNCCL.Options()
         opts.config.blocking = False
+        opts.global_ranks_in_group = list(range(world_size))
 
         pg = BaseProcessGroup(store, rank, world_size)
         pg._set_default_backend(ProcessGroup.BackendType.NCCL)