Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhance] Make sure the FileHandler is still alive after torch.compile #1021

Merged
merged 11 commits into from
Mar 30, 2023
3 changes: 2 additions & 1 deletion mmengine/hooks/logger_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,4 +304,5 @@ def after_run(self, runner) -> None:
if not self.keep_local:
os.remove(local_filepath)
runner.logger.info(f'{local_filepath} was removed due to the '
'`self.keep_local=False`')
'`self.keep_local=False`. You can check '
f'the running logs in {out_filepath}')
5 changes: 5 additions & 0 deletions mmengine/runner/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,11 @@ def build_logger(self,

log_cfg = dict(log_level=log_level, log_file=log_file, **kwargs)
log_cfg.setdefault('name', self._experiment_name)
# `torch.compile` in PyTorch 2.0 can unexpectedly close all user-defined
# handlers. Using file mode 'a' helps prevent abnormal termination of
# the FileHandler and ensures that the log file can be continuously
# updated during the lifespan of the runner.
log_cfg.setdefault('file_mode', 'a')

return MMLogger.get_instance(**log_cfg) # type: ignore

Expand Down
8 changes: 8 additions & 0 deletions tests/test_runner/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1745,6 +1745,14 @@ def test_train_with_compile(self):
runner = Runner.from_cfg(cfg)
runner.train()

runner._maybe_compile('train_step')
# PyTorch 2.0.0 could close the FileHandler after calling
# ``torch.compile``, so we need to test that our file handler still works.
with open(osp.join(f'{runner.log_dir}',
f'{runner.timestamp}.log')) as f:
last_line = f.readlines()[-1]
self.assertTrue(last_line.endswith('please be patient.\n'))

def test_val(self):
cfg = copy.deepcopy(self.epoch_based_cfg)
cfg.experiment_name = 'test_val1'
Expand Down