-
Notifications
You must be signed in to change notification settings - Fork 400
/
profiler.py
263 lines (212 loc) · 11.5 KB
/
profiler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0
"""Composer Profiler."""
from __future__ import annotations
import logging
import pathlib
from typing import Callable, Dict, List, Sequence, Tuple, Union
from composer.core.callback import Callback
from composer.core.state import State
from composer.profiler.marker import Marker
from composer.profiler.profiler_action import ProfilerAction
from composer.profiler.system_profiler import SystemProfiler
from composer.profiler.torch_profiler import TorchProfiler
from composer.profiler.trace_handler import TraceHandler
from composer.utils.iter_helpers import ensure_tuple
__all__ = ['Profiler']
log = logging.getLogger(__name__)
# Optional dependency: ``yahp`` is only required for hparams-based construction.
# The import is a pure presence check -- the module itself is not used here.
try:
    import yahp
    del yahp  # unused; only imported to detect whether yahp is installed
except ImportError:
    # yahp is not installed: fall back to empty registries so that the
    # ``Profiler.hparams_registry`` class attribute below still resolves.
    profiler_scheduler_registry = {}
    trace_handler_registry = {}
else:
    # yahp is available: use the real registries from the hparams module.
    from composer.profiler.profiler_hparams import profiler_scheduler_registry, trace_handler_registry
class Profiler:
    """Composer Profiler.

    See the :doc:`Profiling Guide </trainer/performance_tutorials/profiling>` for additional information.

    Args:
        schedule ((State) -> ProfilerAction): The profiling scheduling function.

            It takes the training state and returns a :class:`.ProfilerAction`.
            For convenience, Composer includes a :meth:`~composer.profiler.cyclic_schedule.cyclic_schedule` helper.

            .. testsetup::

                from composer.profiler import Profiler, cyclic_schedule

                original_profiler_init = Profiler.__init__

                def new_profiler_init(self, dummy_ellipsis=None, **kwargs):
                    if 'trace_handlers' not in kwargs:
                        kwargs['trace_handlers'] = []
                    original_profiler_init(self, **kwargs)

                Profiler.__init__ = new_profiler_init

            .. testcode::

                from composer.profiler import Profiler, cyclic_schedule

                profiler = Profiler(
                    ...,
                    schedule=cyclic_schedule(
                        skip_first=1,
                        wait=0,
                        warmup=1,
                        active=4,
                        repeat=1,
                    ),
                )
        trace_handlers (TraceHandler | Sequence[TraceHandler]): Trace handlers which record and
            save profiling data to traces.
        sys_prof_cpu (bool, optional): Whether to record cpu statistics. (default: ``True``).
        sys_prof_memory (bool, optional): Whether to record memory statistics. (default: ``False``).
        sys_prof_disk (bool, optional): Whether to record disk statistics. (default: ``False``).
        sys_prof_net (bool, optional): Whether to record network statistics. (default: ``False``).
        sys_prof_stats_thread_interval_seconds (float, optional): Interval to record stats, in seconds.
            (default: ``0.5``).
        torch_prof_folder (str, optional): See :class:`~composer.profiler.torch_profiler.TorchProfiler`.
        torch_prof_filename (str, optional): See :class:`~composer.profiler.torch_profiler.TorchProfiler`.
        torch_prof_artifact_name (str, optional): See :class:`~composer.profiler.torch_profiler.TorchProfiler`.
        torch_prof_overwrite (bool, optional): See :class:`~composer.profiler.torch_profiler.TorchProfiler`.
        torch_prof_use_gzip (bool, optional): See :class:`~composer.profiler.torch_profiler.TorchProfiler`.
        torch_prof_record_shapes (bool, optional): See :class:`~composer.profiler.torch_profiler.TorchProfiler`.
        torch_prof_profile_memory (bool, optional): See :class:`~composer.profiler.torch_profiler.TorchProfiler`.
        torch_prof_with_stack (bool, optional): See :class:`~composer.profiler.torch_profiler.TorchProfiler`.
        torch_prof_with_flops (bool, optional): See :class:`~composer.profiler.torch_profiler.TorchProfiler`.
        torch_prof_num_traces_to_keep (int, optional): See :class:`~composer.profiler.torch_profiler.TorchProfiler`.
    """

    # Registries used by yahp for hparams-based construction; empty dicts when
    # yahp is not installed (see the module-level try/except above).
    hparams_registry = {
        'schedule': profiler_scheduler_registry,
        'trace_handlers': trace_handler_registry,
    }

    def __init__(
        self,
        schedule: Callable[[State], ProfilerAction],
        trace_handlers: List[TraceHandler],
        sys_prof_cpu: bool = True,
        sys_prof_memory: bool = False,
        sys_prof_disk: bool = False,
        sys_prof_net: bool = False,
        sys_prof_stats_thread_interval_seconds: float = 0.5,
        torch_prof_folder: str = '{run_name}/torch_traces',
        torch_prof_filename: str = 'rank{rank}.{batch}.pt.trace.json',
        torch_prof_artifact_name: str = '{run_name}/torch_traces/rank{rank}.{batch}.pt.trace.json',
        torch_prof_overwrite: bool = False,
        torch_prof_use_gzip: bool = False,
        torch_prof_record_shapes: bool = False,
        torch_prof_profile_memory: bool = True,
        torch_prof_with_stack: bool = False,
        torch_prof_with_flops: bool = True,
        torch_prof_num_traces_to_keep: int = -1,
    ) -> None:
        self._names_to_markers: Dict[str, Marker] = {}
        # Copy into a list so the setter below can mutate it in place.
        self._trace_handlers = list(ensure_tuple(trace_handlers))
        self.schedule = schedule
        # Unbound until ``bind_to_state`` is invoked; ``marker()`` checks for this.
        self.state: Union[State, None] = None
        self._callbacks: List[Callback] = []
        # Only start the system-stats profiler if at least one statistic is requested.
        if sys_prof_cpu or sys_prof_memory or sys_prof_disk or sys_prof_net:
            self._callbacks.append(
                SystemProfiler(profile_cpu=sys_prof_cpu,
                               profile_memory=sys_prof_memory,
                               profile_disk=sys_prof_disk,
                               profile_net=sys_prof_net,
                               stats_thread_interval_seconds=sys_prof_stats_thread_interval_seconds))
        # Only construct the torch profiler if at least one of its features is enabled.
        if torch_prof_record_shapes or torch_prof_profile_memory or torch_prof_with_stack or torch_prof_with_flops:
            self._callbacks.append(
                TorchProfiler(filename=torch_prof_filename,
                              folder=torch_prof_folder,
                              artifact_name=torch_prof_artifact_name,
                              num_traces_to_keep=torch_prof_num_traces_to_keep,
                              overwrite=torch_prof_overwrite,
                              record_shapes=torch_prof_record_shapes,
                              profile_memory=torch_prof_profile_memory,
                              use_gzip=torch_prof_use_gzip,
                              with_stack=torch_prof_with_stack,
                              with_flops=torch_prof_with_flops))

    def bind_to_state(
        self,
        state: State,
    ):
        """Bind the profiler to the ``state``.

        .. note::

            The :class:`.Trainer` automatically invokes this method.

        Args:
            state (State): The training state.

        Raises:
            RuntimeError: If the profiler is already bound to a state. Binding twice
                would register every callback and trace handler a second time, causing
                all events to be recorded in duplicate.
        """
        if self.state is not None:
            raise RuntimeError('The profiler is already bound to a state. It cannot be bound twice.')
        self.state = state
        self.state.callbacks.extend(self._callbacks)
        self.state.callbacks.extend(self._trace_handlers)

    @property
    def trace_handlers(self):
        """Profiler trace handlers."""
        return self._trace_handlers

    @trace_handlers.setter
    def trace_handlers(self, trace_handlers: Union[TraceHandler, Sequence[TraceHandler]]):
        """Profiler trace handlers."""
        # Slice-assign to mutate the existing list in place, so any external
        # references to the handler list (e.g. state.callbacks) stay valid.
        self._trace_handlers[:] = ensure_tuple(trace_handlers)

    def record_chrome_json_trace_file(self, filepath: Union[str, pathlib.Path]):
        """Record trace events in Chrome JSON format in the trace handlers.

        See `this document <https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview>`_
        for more information about Chrome JSON format.

        .. note::

            For custom profiling, it is recommended to use :meth:`marker` instead of manually creating a Chrome JSON
            trace file. By default, the Composer Profiler automatically saves :class:`.Marker` events in Chrome
            JSON format.

            This method exists for external profilers that natively record events in Chrome JSON format (such as the
            :class:`~composer.profiler.torch_profiler.TorchProfiler`). These profilers can use this method to route
            their profiling traces to the Composer profiler :attr:`~trace_handlers` so events from both the Composer
            Profiler and external profilers are recorded in the same trace file.
        """
        for recorder in self.trace_handlers:
            recorder.process_chrome_json_trace_file(pathlib.Path(filepath))

    def marker(
        self,
        name: str,
        actions: Sequence[ProfilerAction] = (ProfilerAction.WARMUP, ProfilerAction.ACTIVE,
                                             ProfilerAction.ACTIVE_AND_SAVE),
        record_instant_on_start: bool = False,
        record_instant_on_finish: bool = False,
        categories: Union[List[str], Tuple[str, ...]] = (),
    ) -> Marker:
        """Create and get an instance of a :class:`.Marker`.

        If a :class:`.Marker` with the specified ``name`` does not already exist, it will be created.
        Otherwise, the existing instance will be returned.

        .. note::

            :meth:`.Profiler.marker()` should be used to construct markers.  :class:`.Marker` **should not** be
            instantiated directly by the user.

        For example:

        .. testsetup:: composer.profiler.profiler.Profiler.marker

            from composer.profiler import Profiler, cyclic_schedule

            profiler = Profiler(schedule=cyclic_schedule(), trace_handlers=[])
            profiler.bind_to_state(state)
            state.profiler = profiler

        .. doctest:: composer.profiler.profiler.Profiler.marker

            >>> marker = profiler.marker("foo")
            >>> marker
            <composer.profiler.marker.Marker object at ...>

        Please see :meth:`.Marker.start()` and :meth:`.Marker.finish()` for usage on creating markers to measure duration events,
        :meth:`.Marker.instant()` for usage on creating markers to mark instant events and :meth:`.Marker.counter()` for usage on
        creating markers for counting.

        Args:
            name (str): The name for the :class:`.Marker`.
            actions (Sequence[ProfilerAction], optional): :class:`.ProfilerAction` states to record on.
                Defaults to (:attr:`~.ProfilerAction.WARMUP`, :attr:`~.ProfilerAction.ACTIVE`,
                :attr:`~.ProfilerAction.ACTIVE_AND_SAVE`).
            record_instant_on_start (bool, optional): Whether to record an instant event whenever the marker is started.
                Defaults to ``False``.
            record_instant_on_finish (bool, optional): Whether to record an instant event whenever the marker is finished.
                Defaults to ``False``.
            categories (Union[List[str], Tuple[str, ...]], optional): Categories for this marker. Defaults to ``()``.

        Returns:
            Marker: Marker instance.

        Raises:
            RuntimeError: If :meth:`bind_to_state` has not been invoked yet.
        """
        if self.state is None:
            raise RuntimeError('Profiler.bind_to_state() must be invoked before the Profiler can be used.')
        if name not in self._names_to_markers:

            def should_record(state: State) -> bool:
                # A marker records only when the current schedule action is one
                # the caller asked for.
                return self.schedule(state) in actions

            self._names_to_markers[name] = Marker(
                state=self.state,
                trace_handlers=self.trace_handlers,
                name=name,
                should_record=should_record,
                record_instant_on_start=record_instant_on_start,
                record_instant_on_finish=record_instant_on_finish,
                categories=categories,
            )
        # Refresh the categories even for a pre-existing marker so the latest
        # call's categories take effect.
        self._names_to_markers[name].categories = categories
        return self._names_to_markers[name]