Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions torchrec/distributed/train_pipeline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
# pyre-strict


from torchrec.distributed.train_pipeline.pipeline_context import ( # noqa
In,
Out,
TrainPipelineContext,
)
from torchrec.distributed.train_pipeline.train_pipelines import ( # noqa
EvalPipelineSparseDist, # noqa
PrefetchTrainPipelineSparseDist, # noqa
Expand All @@ -30,10 +35,7 @@
ArgInfoStepFactory, # noqa
CallArgs, # noqa
DataLoadingThread, # noqa
In, # noqa
Out, # noqa
SparseDataDistUtil, # noqa
StageOut, # noqa
Tracer, # noqa
TrainPipelineContext, # noqa
)
98 changes: 98 additions & 0 deletions torchrec/distributed/train_pipeline/pipeline_context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union

import torch

from torchrec.distributed.embedding_sharding import FusedKJTListSplitsAwaitable
from torchrec.distributed.types import Awaitable, LazyAwaitable
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor
from torchrec.streamable import Multistreamable, Pipelineable

logger: logging.Logger = logging.getLogger(__name__)


In = TypeVar("In", bound=Pipelineable)
Out = TypeVar("Out")


@dataclass
class TrainPipelineContext:
"""
Context information for a `TrainPipelineSparseDist` instance.

Attributes:
input_dist_splits_requests (Dict[str, Awaitable[Any]]): Stores input dist
requests in the splits awaitable stage, which occurs after starting the
input dist.
input_dist_tensors_requests (Dict[str, Awaitable[Any]]): Stores input dist
requests in the tensors awaitable stage, which occurs after calling `wait()`
on the splits awaitable.
module_contexts (Dict[str, Multistreamable]): Stores module contexts from the
input dist for the current batch.
module_contexts_next_batch (Dict[str, Multistreamable]): Stores module contexts
from the input dist for the next batch. (only for version 0)
fused_splits_awaitables (List[Tuple[List[str], FusedKJTListSplitsAwaitable]]):
List of fused splits input dist awaitable and the corresponding module names
of each awaitable.
event: Optional[torch.cuda.Event]: Event to record the completion of this stage
index: Optional[int]: Index of the current batch.
version: int = 0; support for backward compatiblity
"""

# pyre-ignore [4]
input_dist_splits_requests: Dict[str, Awaitable[Any]] = field(default_factory=dict)
# pyre-ignore [4]
input_dist_tensors_requests: Dict[str, Awaitable[Any]] = field(default_factory=dict)
module_contexts: Dict[str, Multistreamable] = field(default_factory=dict)
module_contexts_next_batch: Dict[str, Multistreamable] = field(
default_factory=dict
) # deprecated: to support legacy code
fused_splits_awaitables: List[Tuple[List[str], FusedKJTListSplitsAwaitable]] = (
field(default_factory=list)
)
events: List[torch.Event] = field(default_factory=list)
postproc_fwd_results: Dict[str, Any] = field(default_factory=dict)
index: Optional[int] = None
version: int = (
0 # 1 is current version, 0 is deprecated but supported for backward compatibility
)


@dataclass
class PrefetchTrainPipelineContext(TrainPipelineContext):
module_input_post_prefetch: Dict[str, Multistreamable] = field(default_factory=dict)
module_contexts_post_prefetch: Dict[str, Multistreamable] = field(
default_factory=dict
)
module_input_post_prefetch_next_batch: Dict[str, Multistreamable] = field(
default_factory=dict
)
module_contexts_post_prefetch_next_batch: Dict[str, Multistreamable] = field(
default_factory=dict
)


@dataclass
class EmbeddingTrainPipelineContext(TrainPipelineContext):
embedding_a2a_requests: Dict[
str,
Union[
LazyAwaitable[Multistreamable],
# ManagedCollisionEC/EBC returns tuple of awaitables
Tuple[
LazyAwaitable[KeyedTensor], LazyAwaitable[Optional[KeyedJaggedTensor]]
],
],
] = field(default_factory=dict)
embedding_tensors: List[List[torch.Tensor]] = field(default_factory=list)
embedding_features: List[List[Union[str, List[str]]]] = field(default_factory=list)
detached_embedding_tensors: List[List[torch.Tensor]] = field(default_factory=list)
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
#!/usr/bin/env python3

import copy
import os
from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type, Union
from typing import Any, cast, Dict, List, Optional, Tuple, Type, Union

import click

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from torchrec.distributed.tests.test_fp_embeddingbag_utils import (
create_module_and_freeze,
)
from torchrec.distributed.train_pipeline.pipeline_context import TrainPipelineContext
from torchrec.distributed.train_pipeline.tests.test_train_pipelines_base import (
TrainPipelineSparseDistTestBase,
)
Expand All @@ -73,7 +74,6 @@
PostprocArgInfoStep,
SparseDataDistUtil,
StageOut,
TrainPipelineContext,
)
from torchrec.distributed.types import (
ModuleSharder,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from torchrec.distributed.embedding_types import EmbeddingComputeKernel
from torchrec.distributed.test_utils.test_model import ModelInput, TestNegSamplingModule
from torchrec.distributed.train_pipeline.pipeline_context import TrainPipelineContext

from torchrec.distributed.train_pipeline.tests.test_train_pipelines_base import (
TrainPipelineSparseDistTestBase,
Expand All @@ -30,7 +31,6 @@
NodeArgsHelper,
PipelinedForward,
PipelinedPostproc,
TrainPipelineContext,
)
from torchrec.distributed.types import ShardingType
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
Expand Down
12 changes: 7 additions & 5 deletions torchrec/distributed/train_pipeline/train_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@
from torch.autograd.profiler import record_function
from torchrec.distributed.dist_data import KJTAllToAllTensorsAwaitable
from torchrec.distributed.model_parallel import ShardedModule
from torchrec.distributed.train_pipeline.pipeline_context import (
EmbeddingTrainPipelineContext,
In,
Out,
PrefetchTrainPipelineContext,
TrainPipelineContext,
)
from torchrec.distributed.train_pipeline.utils import (
_override_input_dist_forwards,
_pipeline_detach_model,
Expand All @@ -45,19 +52,14 @@
_wait_for_events,
DataLoadingThread,
EmbeddingPipelinedForward,
EmbeddingTrainPipelineContext,
In,
InSyncEmbeddingPipelinedForward,
Out,
PipelinedForward,
PipelinedPostproc,
PipelineStage,
PrefetchPipelinedForward,
PrefetchTrainPipelineContext,
RunnableType,
StageOut,
StageOutputWithEvent,
TrainPipelineContext,
use_context_for_postprocs,
)
from torchrec.distributed.types import Awaitable
Expand Down
89 changes: 8 additions & 81 deletions torchrec/distributed/train_pipeline/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import logging
from collections import defaultdict, deque, OrderedDict
from contextlib import AbstractContextManager
from dataclasses import dataclass, field
from dataclasses import dataclass

from itertools import chain
from threading import Event, Thread
Expand All @@ -40,7 +40,6 @@
import torch
from torch import distributed as dist
from torch.utils.hooks import RemovableHandle
from torchrec.distributed.types import LazyAwaitable

if not torch._running_with_deploy():
from torch.distributed._composable.fsdp.fully_shard import FSDPModule as FSDP2
Expand All @@ -66,6 +65,13 @@ class FSDP2:
)
from torchrec.distributed.embedding_types import KJTList
from torchrec.distributed.model_parallel import DistributedModelParallel, ShardedModule
from torchrec.distributed.train_pipeline.pipeline_context import (
EmbeddingTrainPipelineContext,
In,
Out, # noqa
PrefetchTrainPipelineContext,
TrainPipelineContext,
)

from torchrec.distributed.types import Awaitable, LazyNoWait

Expand All @@ -74,90 +80,11 @@ class FSDP2:

logger: logging.Logger = logging.getLogger(__name__)

import torch

In = TypeVar("In", bound=Pipelineable)
StageOut = TypeVar("StageOut", bound=Pipelineable)
Out = TypeVar("Out")

RunnableType = Callable[..., StageOut]
StageOutputWithEvent = Tuple[Optional[StageOut], Optional[torch.Event]]


@dataclass
class TrainPipelineContext:
"""
Context information for a `TrainPipelineSparseDist` instance.

Attributes:
input_dist_splits_requests (Dict[str, Awaitable[Any]]): Stores input dist
requests in the splits awaitable stage, which occurs after starting the
input dist.
input_dist_tensors_requests (Dict[str, Awaitable[Any]]): Stores input dist
requests in the tensors awaitable stage, which occurs after calling `wait()`
on the splits awaitable.
module_contexts (Dict[str, Multistreamable]): Stores module contexts from the
input dist for the current batch.
module_contexts_next_batch (Dict[str, Multistreamable]): Stores module contexts
from the input dist for the next batch. (only for version 0)
fused_splits_awaitables (List[Tuple[List[str], FusedKJTListSplitsAwaitable]]):
List of fused splits input dist awaitable and the corresponding module names
of each awaitable.
event: Optional[torch.cuda.Event]: Event to record the completion of this stage
index: Optional[int]: Index of the current batch.
version: int = 0; support for backward compatiblity
"""

# pyre-ignore [4]
input_dist_splits_requests: Dict[str, Awaitable[Any]] = field(default_factory=dict)
# pyre-ignore [4]
input_dist_tensors_requests: Dict[str, Awaitable[Any]] = field(default_factory=dict)
module_contexts: Dict[str, Multistreamable] = field(default_factory=dict)
module_contexts_next_batch: Dict[str, Multistreamable] = field(
default_factory=dict
) # deprecated: to support legacy code
fused_splits_awaitables: List[Tuple[List[str], FusedKJTListSplitsAwaitable]] = (
field(default_factory=list)
)
events: List[torch.Event] = field(default_factory=list)
postproc_fwd_results: Dict[str, Any] = field(default_factory=dict)
index: Optional[int] = None
version: int = (
0 # 1 is current version, 0 is deprecated but supported for backward compatibility
)


@dataclass
class PrefetchTrainPipelineContext(TrainPipelineContext):
module_input_post_prefetch: Dict[str, Multistreamable] = field(default_factory=dict)
module_contexts_post_prefetch: Dict[str, Multistreamable] = field(
default_factory=dict
)
module_input_post_prefetch_next_batch: Dict[str, Multistreamable] = field(
default_factory=dict
)
module_contexts_post_prefetch_next_batch: Dict[str, Multistreamable] = field(
default_factory=dict
)


@dataclass
class EmbeddingTrainPipelineContext(TrainPipelineContext):
embedding_a2a_requests: Dict[
str,
Union[
LazyAwaitable[Multistreamable],
# ManagedCollisionEC/EBC returns tuple of awaitables
Tuple[
LazyAwaitable[KeyedTensor], LazyAwaitable[Optional[KeyedJaggedTensor]]
],
],
] = field(default_factory=dict)
embedding_tensors: List[List[torch.Tensor]] = field(default_factory=list)
embedding_features: List[List[Union[str, List[str]]]] = field(default_factory=list)
detached_embedding_tensors: List[List[torch.Tensor]] = field(default_factory=list)


@dataclass
class PipelineStage:
"""
Expand Down
Loading