[bugfix] fix ignore_data_skip #9220
Conversation
Code Review
This pull request modifies the `_get_resume_checkpoint` method in `swift/pipelines/train/sft.py` to read configuration from `trainer.args` instead of `self.args`. The feedback below points out that `trainer.args` (typically a `TrainingArguments` object) may not carry custom attributes such as `use_flash_ckpt` or `callbacks`, which could lead to an `AttributeError` or a logic error. It is recommended to use `trainer.args` only for standard fields and to continue using `self.args` for custom pipeline-level flags.
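For context, here is a minimal sketch of why the two objects can diverge. `TrainerArgs` and `PipelineArgs` are hypothetical stand-ins, not the real class definitions: the actual types are the trainer's sanitized `TrainingArguments` and swift's extended argument object (`self.args`).

```python
# Minimal sketch with hypothetical stand-in classes: TrainerArgs plays the role
# of the sanitized TrainingArguments held by the trainer, PipelineArgs the role
# of swift's extended argument object (self.args).
from dataclasses import dataclass, field

@dataclass
class TrainerArgs:
    resume_from_checkpoint: str = ''  # standard trainer-level field

@dataclass
class PipelineArgs(TrainerArgs):
    use_flash_ckpt: bool = False                   # custom pipeline-level flag
    callbacks: list = field(default_factory=list)  # custom pipeline-level list

self_args = PipelineArgs(use_flash_ckpt=True, callbacks=['deepspeed_elastic'])
trainer_args = TrainerArgs()  # custom flags never make it onto this object

print(self_args.use_flash_ckpt)                # True
print(getattr(trainer_args, 'callbacks', []))  # [] -> elastic check silently disabled
# trainer_args.use_flash_ckpt                  # would raise AttributeError
```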
```diff
 args = trainer.args
 if args.resume_from_checkpoint:
     return args.resume_from_checkpoint
 resume_checkpoint = None
 # If flash checkpoint is enabled, try to resume from the last complete checkpoint.
 # If the previous training finished, resume_checkpoint stays None.
-if self.args.use_flash_ckpt:
+if args.use_flash_ckpt:
     # resume_checkpoint = <resume_dir>/checkpoint-<step>
     resume_checkpoint = trainer.get_resume_checkpoint()
 # Elastic runs require a universal checkpoint; fall back when missing or incomplete.
-callbacks = set(getattr(self.args, 'callbacks', []))
+callbacks = set(getattr(args, 'callbacks', []))
 elastic_enabled = 'deepspeed_elastic' in callbacks
```
Using `trainer.args` for custom attributes like `use_flash_ckpt` and `callbacks` is risky. `trainer.args` (which is `self.args.training_args`) is typically a sanitized `TrainingArguments` object that may not contain the extended attributes defined in `SftArguments` or `BaseArguments`.

- **Potential `AttributeError`:** if `use_flash_ckpt` is not defined on the `TrainingArguments` class used by the trainer, line 252 will raise an `AttributeError`.
- **Logic error:** `TrainingArguments` does not have a `callbacks` attribute by default, so `getattr(args, 'callbacks', [])` on `trainer.args` will likely return an empty list, which would disable the `deepspeed_elastic` check even if it was requested in the original configuration.

It is recommended to use `trainer.args` only for standard fields like `resume_from_checkpoint` and to continue using `self.args` for custom pipeline-level flags.
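The failure mode is easy to reproduce with a stock transformers `TrainingArguments` (a sketch assuming that is what `trainer.args` holds here; swift's actual trainer args may subclass it and differ):

```python
# Reproduction sketch, assuming trainer.args is a stock transformers
# TrainingArguments; swift's actual trainer args may differ.
from transformers import TrainingArguments

args = TrainingArguments(output_dir='out')
print(getattr(args, 'callbacks', []))  # [] -> 'deepspeed_elastic' check silently skipped
args.use_flash_ckpt                    # raises AttributeError: no attribute 'use_flash_ckpt'
```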
Suggested change:

```diff
 args = trainer.args
 if args.resume_from_checkpoint:
     return args.resume_from_checkpoint
 resume_checkpoint = None
 # If flash checkpoint is enabled, try to resume from the last complete checkpoint.
 # If the previous training finished, resume_checkpoint stays None.
-if args.use_flash_ckpt:
+if self.args.use_flash_ckpt:
     # resume_checkpoint = <resume_dir>/checkpoint-<step>
     resume_checkpoint = trainer.get_resume_checkpoint()
 # Elastic runs require a universal checkpoint; fall back when missing or incomplete.
-callbacks = set(getattr(args, 'callbacks', []))
+callbacks = set(getattr(self.args, 'callbacks', []))
 elastic_enabled = 'deepspeed_elastic' in callbacks
```
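If the suggestion is applied, a lightweight regression check for the intended precedence could look like the following sketch. `get_resume_checkpoint` here is a hypothetical free-function stand-in for the method, with `SimpleNamespace` stubs in place of the real trainer and args:

```python
# Sketch of a regression check for the suggested precedence: an explicit
# resume_from_checkpoint wins, and custom flags come from the pipeline args.
# get_resume_checkpoint is a hypothetical stand-in, not swift's actual method.
from types import SimpleNamespace

def get_resume_checkpoint(self_args, trainer):
    args = trainer.args
    if args.resume_from_checkpoint:
        return args.resume_from_checkpoint
    resume_checkpoint = None
    if getattr(self_args, 'use_flash_ckpt', False):
        resume_checkpoint = trainer.get_resume_checkpoint()
    return resume_checkpoint

trainer = SimpleNamespace(
    args=SimpleNamespace(resume_from_checkpoint='ckpt/checkpoint-100'),
    get_resume_checkpoint=lambda: 'ckpt/checkpoint-42')
self_args = SimpleNamespace(use_flash_ckpt=True)

# Explicit resume path takes precedence over the flash-checkpoint lookup.
assert get_resume_checkpoint(self_args, trainer) == 'ckpt/checkpoint-100'
# Without it, the flash-checkpoint path is used because self.args enables it.
trainer.args.resume_from_checkpoint = ''
assert get_resume_checkpoint(self_args, trainer) == 'ckpt/checkpoint-42'
```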
#8383