meta-pytorch · TroyGarden · May 19, 2025
diff --git a/torchrec/distributed/train_pipeline/__init__.py b/torchrec/distributed/train_pipeline/__init__.py
@@ -8,6 +8,11 @@
 # pyre-strict
 
 
+from torchrec.distributed.train_pipeline.pipeline_context import (  # noqa
+    In,
+    Out,
+    TrainPipelineContext,
+)
 from torchrec.distributed.train_pipeline.train_pipelines import (  # noqa
     EvalPipelineSparseDist,  # noqa
     PrefetchTrainPipelineSparseDist,  # noqa
@@ -30,10 +35,7 @@
     ArgInfoStepFactory,  # noqa
     CallArgs,  # noqa
     DataLoadingThread,  # noqa
-    In,  # noqa
-    Out,  # noqa
     SparseDataDistUtil,  # noqa
     StageOut,  # noqa
     Tracer,  # noqa
-    TrainPipelineContext,  # noqa
 )
diff --git a/torchrec/distributed/train_pipeline/pipeline_context.py b/torchrec/distributed/train_pipeline/pipeline_context.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
+
+import torch
+
+from torchrec.distributed.embedding_sharding import FusedKJTListSplitsAwaitable
+from torchrec.distributed.types import Awaitable, LazyAwaitable
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor
+from torchrec.streamable import Multistreamable, Pipelineable
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+In = TypeVar("In", bound=Pipelineable)
+Out = TypeVar("Out")
+
+
+@dataclass
+class TrainPipelineContext:
+    """
+    Context information for a `TrainPipelineSparseDist` instance.
+
+    Attributes:
+        input_dist_splits_requests (Dict[str, Awaitable[Any]]): Stores input dist
+            requests in the splits awaitable stage, which occurs after starting the
+            input dist.
+        input_dist_tensors_requests (Dict[str, Awaitable[Any]]): Stores input dist
+            requests in the tensors awaitable stage, which occurs after calling `wait()`
+            on the splits awaitable.
+        module_contexts (Dict[str, Multistreamable]): Stores module contexts from the
+            input dist for the current batch.
+        module_contexts_next_batch (Dict[str, Multistreamable]): Stores module contexts
+            from the input dist for the next batch. (only for version 0)
+        fused_splits_awaitables (List[Tuple[List[str], FusedKJTListSplitsAwaitable]]):
+            List of fused splits input dist awaitable and the corresponding module names
+            of each awaitable.
+        event: Optional[torch.cuda.Event]: Event to record the completion of this stage
+        index: Optional[int]: Index of the current batch.
+        version: int = 0; support for backward compatiblity
+    """
+
+    # pyre-ignore [4]
+    input_dist_splits_requests: Dict[str, Awaitable[Any]] = field(default_factory=dict)
+    # pyre-ignore [4]
+    input_dist_tensors_requests: Dict[str, Awaitable[Any]] = field(default_factory=dict)
+    module_contexts: Dict[str, Multistreamable] = field(default_factory=dict)
+    module_contexts_next_batch: Dict[str, Multistreamable] = field(
+        default_factory=dict
+    )  # deprecated: to support legacy code
+    fused_splits_awaitables: List[Tuple[List[str], FusedKJTListSplitsAwaitable]] = (
+        field(default_factory=list)
+    )
+    events: List[torch.Event] = field(default_factory=list)
+    postproc_fwd_results: Dict[str, Any] = field(default_factory=dict)
+    index: Optional[int] = None
+    version: int = (
+        0  # 1 is current version, 0 is deprecated but supported for backward compatibility
+    )
+
+
+@dataclass
+class PrefetchTrainPipelineContext(TrainPipelineContext):
+    module_input_post_prefetch: Dict[str, Multistreamable] = field(default_factory=dict)
+    module_contexts_post_prefetch: Dict[str, Multistreamable] = field(
+        default_factory=dict
+    )
+    module_input_post_prefetch_next_batch: Dict[str, Multistreamable] = field(
+        default_factory=dict
+    )
+    module_contexts_post_prefetch_next_batch: Dict[str, Multistreamable] = field(
+        default_factory=dict
+    )
+
+
+@dataclass
+class EmbeddingTrainPipelineContext(TrainPipelineContext):
+    embedding_a2a_requests: Dict[
+        str,
+        Union[
+            LazyAwaitable[Multistreamable],
+            # ManagedCollisionEC/EBC returns tuple of awaitables
+            Tuple[
+                LazyAwaitable[KeyedTensor], LazyAwaitable[Optional[KeyedJaggedTensor]]
+            ],
+        ],
+    ] = field(default_factory=dict)
+    embedding_tensors: List[List[torch.Tensor]] = field(default_factory=list)
+    embedding_features: List[List[Union[str, List[str]]]] = field(default_factory=list)
+    detached_embedding_tensors: List[List[torch.Tensor]] = field(default_factory=list)
diff --git a/torchrec/distributed/train_pipeline/tests/pipeline_benchmarks.py b/torchrec/distributed/train_pipeline/tests/pipeline_benchmarks.py
@@ -10,8 +10,7 @@
 #!/usr/bin/env python3
 
 import copy
-import os
-from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, cast, Dict, List, Optional, Tuple, Type, Union
 
 import click
 

diff --git a/torchrec/distributed/train_pipeline/tests/test_train_pipelines.py b/torchrec/distributed/train_pipeline/tests/test_train_pipelines.py
@@ -47,6 +47,7 @@
 from torchrec.distributed.tests.test_fp_embeddingbag_utils import (
     create_module_and_freeze,
 )
+from torchrec.distributed.train_pipeline.pipeline_context import TrainPipelineContext
 from torchrec.distributed.train_pipeline.tests.test_train_pipelines_base import (
     TrainPipelineSparseDistTestBase,
 )
@@ -73,7 +74,6 @@
     PostprocArgInfoStep,
     SparseDataDistUtil,
     StageOut,
-    TrainPipelineContext,
 )
 from torchrec.distributed.types import (
     ModuleSharder,

diff --git a/torchrec/distributed/train_pipeline/tests/test_train_pipelines_utils.py b/torchrec/distributed/train_pipeline/tests/test_train_pipelines_utils.py
@@ -18,6 +18,7 @@
 
 from torchrec.distributed.embedding_types import EmbeddingComputeKernel
 from torchrec.distributed.test_utils.test_model import ModelInput, TestNegSamplingModule
+from torchrec.distributed.train_pipeline.pipeline_context import TrainPipelineContext
 
 from torchrec.distributed.train_pipeline.tests.test_train_pipelines_base import (
     TrainPipelineSparseDistTestBase,
@@ -30,7 +31,6 @@
     NodeArgsHelper,
     PipelinedForward,
     PipelinedPostproc,
-    TrainPipelineContext,
 )
 from torchrec.distributed.types import ShardingType
 from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

diff --git a/torchrec/distributed/train_pipeline/train_pipelines.py b/torchrec/distributed/train_pipeline/train_pipelines.py
@@ -33,6 +33,13 @@
 from torch.autograd.profiler import record_function
 from torchrec.distributed.dist_data import KJTAllToAllTensorsAwaitable
 from torchrec.distributed.model_parallel import ShardedModule
+from torchrec.distributed.train_pipeline.pipeline_context import (
+    EmbeddingTrainPipelineContext,
+    In,
+    Out,
+    PrefetchTrainPipelineContext,
+    TrainPipelineContext,
+)
 from torchrec.distributed.train_pipeline.utils import (
     _override_input_dist_forwards,
     _pipeline_detach_model,
@@ -45,19 +52,14 @@
     _wait_for_events,
     DataLoadingThread,
     EmbeddingPipelinedForward,
-    EmbeddingTrainPipelineContext,
-    In,
     InSyncEmbeddingPipelinedForward,
-    Out,
     PipelinedForward,
     PipelinedPostproc,
     PipelineStage,
     PrefetchPipelinedForward,
-    PrefetchTrainPipelineContext,
     RunnableType,
     StageOut,
     StageOutputWithEvent,
-    TrainPipelineContext,
     use_context_for_postprocs,
 )
 from torchrec.distributed.types import Awaitable

diff --git a/torchrec/distributed/train_pipeline/utils.py b/torchrec/distributed/train_pipeline/utils.py
@@ -14,7 +14,7 @@
 import logging
 from collections import defaultdict, deque, OrderedDict
 from contextlib import AbstractContextManager
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 
 from itertools import chain
 from threading import Event, Thread
@@ -40,7 +40,6 @@
 import torch
 from torch import distributed as dist
 from torch.utils.hooks import RemovableHandle
-from torchrec.distributed.types import LazyAwaitable
 
 if not torch._running_with_deploy():
     from torch.distributed._composable.fsdp.fully_shard import FSDPModule as FSDP2
@@ -66,6 +65,13 @@ class FSDP2:
 )
 from torchrec.distributed.embedding_types import KJTList
 from torchrec.distributed.model_parallel import DistributedModelParallel, ShardedModule
+from torchrec.distributed.train_pipeline.pipeline_context import (
+    EmbeddingTrainPipelineContext,
+    In,
+    Out,  # noqa
+    PrefetchTrainPipelineContext,
+    TrainPipelineContext,
+)
 
 from torchrec.distributed.types import Awaitable, LazyNoWait
 
@@ -74,90 +80,11 @@ class FSDP2:
 
 logger: logging.Logger = logging.getLogger(__name__)
 
-import torch
-
-In = TypeVar("In", bound=Pipelineable)
 StageOut = TypeVar("StageOut", bound=Pipelineable)
-Out = TypeVar("Out")
-
 RunnableType = Callable[..., StageOut]
 StageOutputWithEvent = Tuple[Optional[StageOut], Optional[torch.Event]]
 
 
-@dataclass
-class TrainPipelineContext:
-    """
-    Context information for a `TrainPipelineSparseDist` instance.
-
-    Attributes:
-        input_dist_splits_requests (Dict[str, Awaitable[Any]]): Stores input dist
-            requests in the splits awaitable stage, which occurs after starting the
-            input dist.
-        input_dist_tensors_requests (Dict[str, Awaitable[Any]]): Stores input dist
-            requests in the tensors awaitable stage, which occurs after calling `wait()`
-            on the splits awaitable.
-        module_contexts (Dict[str, Multistreamable]): Stores module contexts from the
-            input dist for the current batch.
-        module_contexts_next_batch (Dict[str, Multistreamable]): Stores module contexts
-            from the input dist for the next batch. (only for version 0)
-        fused_splits_awaitables (List[Tuple[List[str], FusedKJTListSplitsAwaitable]]):
-            List of fused splits input dist awaitable and the corresponding module names
-            of each awaitable.
-        event: Optional[torch.cuda.Event]: Event to record the completion of this stage
-        index: Optional[int]: Index of the current batch.
-        version: int = 0; support for backward compatiblity
-    """
-
-    # pyre-ignore [4]
-    input_dist_splits_requests: Dict[str, Awaitable[Any]] = field(default_factory=dict)
-    # pyre-ignore [4]
-    input_dist_tensors_requests: Dict[str, Awaitable[Any]] = field(default_factory=dict)
-    module_contexts: Dict[str, Multistreamable] = field(default_factory=dict)
-    module_contexts_next_batch: Dict[str, Multistreamable] = field(
-        default_factory=dict
-    )  # deprecated: to support legacy code
-    fused_splits_awaitables: List[Tuple[List[str], FusedKJTListSplitsAwaitable]] = (
-        field(default_factory=list)
-    )
-    events: List[torch.Event] = field(default_factory=list)
-    postproc_fwd_results: Dict[str, Any] = field(default_factory=dict)
-    index: Optional[int] = None
-    version: int = (
-        0  # 1 is current version, 0 is deprecated but supported for backward compatibility
-    )
-
-
-@dataclass
-class PrefetchTrainPipelineContext(TrainPipelineContext):
-    module_input_post_prefetch: Dict[str, Multistreamable] = field(default_factory=dict)
-    module_contexts_post_prefetch: Dict[str, Multistreamable] = field(
-        default_factory=dict
-    )
-    module_input_post_prefetch_next_batch: Dict[str, Multistreamable] = field(
-        default_factory=dict
-    )
-    module_contexts_post_prefetch_next_batch: Dict[str, Multistreamable] = field(
-        default_factory=dict
-    )
-
-
-@dataclass
-class EmbeddingTrainPipelineContext(TrainPipelineContext):
-    embedding_a2a_requests: Dict[
-        str,
-        Union[
-            LazyAwaitable[Multistreamable],
-            # ManagedCollisionEC/EBC returns tuple of awaitables
-            Tuple[
-                LazyAwaitable[KeyedTensor], LazyAwaitable[Optional[KeyedJaggedTensor]]
-            ],
-        ],
-    ] = field(default_factory=dict)
-    embedding_tensors: List[List[torch.Tensor]] = field(default_factory=list)
-    embedding_features: List[List[Union[str, List[str]]]] = field(default_factory=list)
-    detached_embedding_tensors: List[List[torch.Tensor]] = field(default_factory=list)
-
-
 @dataclass
 class PipelineStage:
     """