Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CodeCamp #29] Add profiler hook functionality #768

Merged
merged 41 commits into from
Dec 27, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
f38596d
[Feature] Add profiler hook functionality
BayMaxBHL Nov 25, 2022
e33e8d7
[Feature] Add profiler hook functionality
BayMaxBHL Nov 28, 2022
8a13f4d
[Feature] Add profiler hook functionality
BayMaxBHL Dec 2, 2022
10d97b3
[Feature] Add profiler hook functionality
BayMaxBHL Dec 2, 2022
a8a9c96
[Feature] Add profiler hook functionality
BayMaxBHL Dec 2, 2022
0658d21
[Feature] Add profiler hook functionality
BayMaxBHL Dec 2, 2022
54cf4b3
[Feature] Add profiler hook functionality
BayMaxBHL Dec 2, 2022
545b16d
[Feature] Add profiler hook functionality
BayMaxBHL Dec 2, 2022
792c376
[Feature] Add profiler hook functionality
BayMaxBHL Dec 2, 2022
bf68591
[Feature] Add profiler hook functionality
BayMaxBHL Dec 2, 2022
231c9ea
[Feature] Add profiler hook functionality
BayMaxBHL Dec 3, 2022
6b7d56d
[Feature] Add profiler hook functionality
BayMaxBHL Dec 3, 2022
9bb741f
[Feature] Add profiler hook functionality
BayMaxBHL Dec 3, 2022
6f24a60
[Feature] Add profiler hook functionality
BayMaxBHL Dec 3, 2022
eac140b
[Feature] Add profiler hook functionality
BayMaxBHL Dec 12, 2022
820eef1
[Feature] Add profiler hook functionality
BayMaxBHL Dec 12, 2022
b0a5a03
Merge branch 'open-mmlab:main' into profiler_hook
BayMaxBHL Dec 12, 2022
9fcef9c
[Feature] Add profiler hook functionality
BayMaxBHL Dec 12, 2022
83e98dd
[Feature] Add profiler hook functionality
BayMaxBHL Dec 12, 2022
c8e96ee
[Feature] Add profiler hook functionality
BayMaxBHL Dec 12, 2022
c835d65
[Feature] Add profiler hook functionality
BayMaxBHL Dec 12, 2022
d56f7b4
Merge branch 'open-mmlab:main' into profiler_hook
BayMaxBHL Dec 12, 2022
d1d1fe7
[Feature] Add profiler hook functionality
BayMaxBHL Dec 14, 2022
65a534e
[Feature] Add profiler hook functionality
BayMaxBHL Dec 14, 2022
48c31ad
[Feature] Add profiler hook functionality
BayMaxBHL Dec 14, 2022
c3531c6
[Feature] Add profiler hook functionality
BayMaxBHL Dec 14, 2022
141996c
[Feature] Add profiler hook functionality
BayMaxBHL Dec 14, 2022
cf2f25a
[Feature] Add profiler hook functionality
BayMaxBHL Dec 14, 2022
355b7ae
[Feature] Add profiler hook functionality
BayMaxBHL Dec 15, 2022
17508e4
[Feature] Add profiler hook functionality
BayMaxBHL Dec 15, 2022
f79463c
[Feature] Add profiler hook functionality
BayMaxBHL Dec 15, 2022
bfa23c4
Merge branch 'open-mmlab:main' into profiler_hook
BayMaxBHL Dec 15, 2022
b701a01
Merge branch 'profiler_hook' of github.com:BayMaxBHL/mmengine into pr…
BayMaxBHL Dec 15, 2022
bee0655
[Feature] Add profiler hook functionality
BayMaxBHL Dec 15, 2022
a5a01e7
[Feature] Add profiler hook functionality
BayMaxBHL Dec 15, 2022
c86172f
[Feature] Add profiler hook functionality
BayMaxBHL Dec 15, 2022
e704a13
[Feature] Add profiler hook functionality
BayMaxBHL Dec 16, 2022
0627602
[Feature] Add profiler hook functionality
BayMaxBHL Dec 22, 2022
7da4484
Merge branch 'open-mmlab:main' into profiler_hook
BayMaxBHL Dec 22, 2022
d41afb7
Apply suggestions from code review
zhouzaida Dec 27, 2022
aebd381
Update mmengine/hooks/profiler_hook.py
zhouzaida Dec 27, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion mmengine/hooks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
from .logger_hook import LoggerHook
from .naive_visualization_hook import NaiveVisualizationHook
from .param_scheduler_hook import ParamSchedulerHook
from .profiler_hook import ProfilerHook
from .runtime_info_hook import RuntimeInfoHook
from .sampler_seed_hook import DistSamplerSeedHook
from .sync_buffer_hook import SyncBuffersHook

# Public API of mmengine.hooks; ProfilerHook is newly exported here.
__all__ = [
    'Hook', 'IterTimerHook', 'DistSamplerSeedHook', 'ParamSchedulerHook',
    'SyncBuffersHook', 'EmptyCacheHook', 'CheckpointHook', 'LoggerHook',
    'NaiveVisualizationHook', 'EMAHook', 'RuntimeInfoHook', 'ProfilerHook'
]
213 changes: 213 additions & 0 deletions mmengine/hooks/profiler_hook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import warnings
from typing import Callable, List, Optional, Union

import torch

from mmengine.dist import master_only
from mmengine.hooks import Hook
from mmengine.registry import HOOKS


@HOOKS.register_module()
class ProfilerHook(Hook):
    """ProfilerHook to analyze performance during training.

    PyTorch Profiler is a tool that allows the collection of performance
    metrics during the training. More details on Profiler can be found at
    https://pytorch.org/docs/1.13.1/profiler.html#torch.profiler.profile

    Args:
        by_epoch (bool): Profile performance by epoch or by iteration.
            Defaults to True.
        profile_iters (int): The period (epoch/iter) recorded by the profiler.
            E.g. profile_iters=10 and by_epoch=False records iterations 0-10.
            Defaults to 1.
        activities (list[str], optional): List of activity groups (CPU, CUDA)
            to use in profiling, e.g. ['cpu'], ['cuda'] or ['cpu', 'cuda'].
            Defaults to None, which means ['cpu', 'cuda'].
        schedule (dict, optional): Config of generating the callable schedule.
            The dict can include wait, warmup, active, repeat and skip_first.
            Defaults to None, which means profiling without step markers.
        on_trace_ready (callable, dict, optional): Either a handler or a dict
            used to generate a handler.

            - Terminal: dict(type='log_trace')
            - Tensorboard: dict(type='tb_trace', **trace_cfg), where
              trace_cfg may include dir_name, worker_name and use_gzip,
              and dir_name defaults to "{work_dir}/tf_tracing_logs".

            Defaults to None, which means no trace handler is registered.
        record_shapes (bool): Save information about operator's input shapes.
            Defaults to False.
        profile_memory (bool): Track tensor memory allocation/deallocation.
            Defaults to False.
        with_stack (bool): Record source information (file and line number)
            for the ops. Defaults to False.
        with_flops (bool): Use formula to estimate the FLOPS of specific
            operators (matrix multiplication and 2D convolution).
            Defaults to False.
        json_trace_path (str, optional): Exports the collected trace in Chrome
            JSON format. Chrome can view the json file via 'chrome://tracing'.
            Defaults to None, which means the json file is not saved.

    Examples:
        >>> # tensorboard trace
        >>> trace_config = dict(type='tb_trace', dir_name='work_dir')
        >>> profiler_hook_cfg = dict(on_trace_ready=trace_config)
    """

    def __init__(self,
                 by_epoch: bool = True,
                 profile_iters: int = 1,
                 activities: Optional[List[str]] = None,
                 schedule: Optional[dict] = None,
                 on_trace_ready: Optional[Union[Callable, dict]] = None,
                 record_shapes: bool = False,
                 profile_memory: bool = False,
                 with_stack: bool = False,
                 with_flops: bool = False,
                 json_trace_path: Optional[str] = None) -> None:

        try:
            from torch import profiler  # torch version >= 1.8.1
        except ImportError:
            raise ImportError('profiler is the new feature of torch1.8.1, '
                              f'but your version is {torch.__version__}')

        assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean.'
        self.by_epoch = by_epoch

        if profile_iters < 1:
            raise ValueError('profile_iters should be greater than 0, but got '
                             f'{profile_iters}')
        self.profile_iters = profile_iters

        if activities is None:
            activities = ['cpu', 'cuda']
        if not isinstance(activities, list):
            raise ValueError(
                f'activities should be list, but got {type(activities)}')
        # Map the user-facing string names onto torch's ProfilerActivity enum.
        self.activities = []
        for activity in activities:
            activity = activity.lower()
            if activity == 'cpu':
                self.activities.append(profiler.ProfilerActivity.CPU)
            elif activity == 'cuda':
                self.activities.append(profiler.ProfilerActivity.CUDA)
            else:
                raise ValueError(
                    f'activity should be "cpu" or "cuda", but got {activity}')

        if schedule is not None:
            self.schedule = profiler.schedule(**schedule)
        else:
            self.schedule = None

        self.on_trace_ready = on_trace_ready
        self.record_shapes = record_shapes
        self.profile_memory = profile_memory
        self.with_stack = with_stack
        self.with_flops = with_flops
        self.json_trace_path = json_trace_path
        # Set to True once the profiler context has been exited, so that
        # ``after_train_iter`` never calls ``step()`` on a closed profiler
        # (e.g. when training continues past the profiled epochs).
        self._closed = False

    @master_only
    def before_run(self, runner):
        """Initialize the profiler and enter its context manager.

        Raises:
            ValueError: If ``profile_iters`` exceeds the total number of
                epochs (``by_epoch=True``) or iterations (``by_epoch=False``).
        """
        if self.by_epoch and runner.max_epochs < self.profile_iters:
            raise ValueError('self.profile_iters should not be greater than '
                             f'{runner.max_epochs}')
        if not self.by_epoch and runner.max_iters < self.profile_iters:
            raise ValueError('self.profile_iters should not be greater than '
                             f'{runner.max_iters}')

        _on_trace_ready = self._parse_on_trace_ready(runner)

        if self.by_epoch and self.profile_iters > 1:
            warnings.warn(
                f'Profiler will profile 0-{self.profile_iters} epochs.\n'
                'Since profiler will slow down the training, it is recommended'
                ' to train 1 epoch with ProfilerHook and adjust your setting '
                'according to the profiler summary.\n'
                'During normal training(epoch > 1), '
                'you may disable the ProfilerHook.')

        self.profiler = torch.profiler.profile(  # noqa
            activities=self.activities,
            schedule=self.schedule,
            on_trace_ready=_on_trace_ready,
            record_shapes=self.record_shapes,
            profile_memory=self.profile_memory,
            with_stack=self.with_stack,
            with_flops=self.with_flops)

        # Enter the context manager manually; it is exited symmetrically in
        # ``after_train_epoch`` / ``after_train_iter``.
        self.profiler.__enter__()
        runner.logger.info('profiler is profiling...')

    def _parse_on_trace_ready(self, runner) -> Optional[Callable]:
        """Used to parse the parameter 'on_trace_ready'.

        Returns:
            Optional[Callable]: The trace handler passed to the profiler, or
            None when no handler was configured.

        Raises:
            ImportError: If ``tb_trace`` is requested but torch-tb-profiler
                is not installed.
            ValueError: If the config type is unknown, or ``tb_trace``
                conflicts with ``json_trace_path``.
        """
        if callable(self.on_trace_ready):
            _on_trace_ready = self.on_trace_ready

        elif isinstance(self.on_trace_ready, dict):
            trace_cfg = self.on_trace_ready.copy()
            trace_type = trace_cfg.pop('type')

            if trace_type == 'log_trace':  # log_trace handler

                def _log_handler(prof):
                    # Remaining keys of trace_cfg are forwarded to
                    # ``key_averages().table`` (e.g. sort_by, row_limit).
                    print(prof.key_averages().table(**trace_cfg))

                _on_trace_ready = _log_handler

            elif trace_type == 'tb_trace':  # tensorboard_trace handler
                try:
                    import torch_tb_profiler  # noqa: F401
                except ImportError:
                    raise ImportError(
                        'please run "pip install torch-tb-profiler"')
                # Resolve a relative/missing dir_name against the work_dir.
                if 'dir_name' not in trace_cfg:
                    trace_cfg['dir_name'] = osp.join(runner.work_dir,
                                                     'tf_tracing_logs')
                elif not osp.isabs(trace_cfg['dir_name']):
                    trace_cfg['dir_name'] = osp.join(runner.work_dir,
                                                     trace_cfg['dir_name'])
                runner.logger.info(
                    'tracing files of ProfilerHook will be saved to '
                    f"{trace_cfg['dir_name']}.")
                if self.json_trace_path is not None:
                    # This is a configuration conflict, not an import failure,
                    # so raise ValueError (was incorrectly ImportError).
                    raise ValueError('json path conflicts, please set '
                                     'json_trace_path to none when using '
                                     'tb_trace')
                _on_trace_ready = torch.profiler.tensorboard_trace_handler(
                    **trace_cfg)
            else:
                raise ValueError('trace_type should be "log_trace" or '
                                 f'"tb_trace", but got {trace_type}')
        elif self.on_trace_ready is None:
            _on_trace_ready = None

        else:
            raise ValueError('on_trace_ready should be handler, dict or None, '
                             f'but got {type(self.on_trace_ready)}')
        return _on_trace_ready

    @master_only
    def after_train_epoch(self, runner):
        """Close the profiler after the configured number of epochs."""
        if self.by_epoch and runner.epoch == self.profile_iters - 1:
            runner.logger.info('profiler may take a few minutes...')
            self.profiler.__exit__(None, None, None)
            self._closed = True
            if self.json_trace_path is not None:
                self.profiler.export_chrome_trace(self.json_trace_path)

    @master_only
    def after_train_iter(self, runner, batch_idx, data_batch, outputs):
        """Step the profiler; in iteration mode, close it when done."""
        # Guard against stepping a profiler whose context has already been
        # exited (happens with by_epoch=True once profiling is finished).
        if self._closed:
            return
        self.profiler.step()
        if not self.by_epoch and runner.iter == self.profile_iters - 1:
            runner.logger.info('profiler may take a few minutes...')
            self.profiler.__exit__(None, None, None)
            self._closed = True
            if self.json_trace_path is not None:
                self.profiler.export_chrome_trace(self.json_trace_path)
61 changes: 61 additions & 0 deletions tests/test_hooks/test_profiler_hook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import tempfile

from mmengine.hooks import ProfilerHook  # noqa
from mmengine.testing import RunnerTestCase


class TestProfilerHook(RunnerTestCase):
    """Run the training loop with ProfilerHook under several configs."""

    def test_setup(self):
        """ProfilerHook with default arguments."""
        self.epoch_based_cfg['custom_hooks'] = [
            dict(type='ProfilerHook', priority='NORMAL')
        ]
        self._run()

    def test_print_log(self):
        """ProfilerHook with the terminal (log_trace) handler."""
        self.epoch_based_cfg['custom_hooks'] = [
            dict(
                type='ProfilerHook',
                priority='NORMAL',
                on_trace_ready=dict(type='log_trace'),
            )
        ]
        self._run()

    def test_json(self):
        """ProfilerHook exporting a Chrome JSON trace.

        Uses a temporary directory instead of a hard-coded absolute path so
        the test is portable across machines.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            self.epoch_based_cfg['custom_hooks'] = [
                dict(
                    type='ProfilerHook',
                    priority='NORMAL',
                    json_trace_path=osp.join(tmp_dir, 'trace.json'))
            ]
            self._run()

    def test_tensorboard(self):
        """ProfilerHook with the tensorboard (tb_trace) handler."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            self.epoch_based_cfg['custom_hooks'] = [
                dict(
                    type='ProfilerHook',
                    priority='NORMAL',
                    on_trace_ready=dict(
                        type='tb_trace', dir_name=osp.join(tmp_dir, 'tb')))
            ]
            self._run()

    def _run(self):
        """Train/val/test once in epoch-based and once in iter-based mode."""
        runner = self.build_runner(self.epoch_based_cfg)
        runner.train()
        runner.val()
        runner.test()

        runner = self.build_runner(self.iter_based_cfg)
        runner.train()
        runner.val()
        runner.test()