diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 1b3378dcd7..d07570a021 100644
--- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -105,6 +105,7 @@
 - 🔥output_dir: 默认为None,设置为`output/`
 - 🔥gradient_checkpointing: 是否使用gradient_checkpointing,默认为True
 - 🔥deepspeed: 默认为None。可以设置为'zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload'来使用ms-swift内置的deepspeed配置文件
+- zero_hpz_partition_size: 默认为None,这个参数是ZeRO++的特性,即node内模型分片,node间数据分片,如果遇到grad_norm NaN,请尝试使用`--torch_dtype float16`
 - 🔥per_device_train_batch_size: 默认值1
 - 🔥per_device_eval_batch_size: 默认值1
 - weight_decay: weight衰减系数,默认值0.1
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index ffd8053cc6..4969a61ecd 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -109,6 +109,7 @@ This parameter list inherits from transformers `Seq2SeqTrainingArguments`, with
 - 🔥output_dir: Defaults to None, set as `output/`.
 - 🔥gradient_checkpointing: Whether to use gradient checkpointing, default is True.
 - 🔥deepspeed: Defaults to None. It can be set to 'zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload' to use the built-in deepspeed configuration file of ms-swift.
+- zero_hpz_partition_size: Default is `None`. This parameter is a `ZeRO++` feature that shards the model within a node and shards data across nodes. If you encounter grad_norm `NaN` issues, please try using `--torch_dtype float16`.
 - 🔥per_device_train_batch_size: Default is 1.
 - 🔥per_device_eval_batch_size: Default is 1.
 - weight_decay: Weight decay coefficient, default value is 0.1.
diff --git a/swift/llm/argument/train_args.py b/swift/llm/argument/train_args.py
index 152597aaf7..a2a98c5a28 100644
--- a/swift/llm/argument/train_args.py
+++ b/swift/llm/argument/train_args.py
@@ -162,6 +162,9 @@ class TrainArguments(SwanlabArguments, TorchAccArguments, TunerArguments, Seq2Se
     temperature: float = 0.
     load_args: bool = False
 
+    # zero++
+    zero_hpz_partition_size: Optional[int] = None
+
     def __post_init__(self) -> None:
         if self.resume_from_checkpoint:
             self.resume_from_checkpoint = to_abspath(self.resume_from_checkpoint, True)
@@ -237,6 +240,11 @@ def _init_deepspeed(self):
                     break
 
         self.deepspeed = self.parse_to_dict(self.deepspeed)
+        if self.zero_hpz_partition_size is not None:
+            assert 'zero_optimization' in self.deepspeed
+            self.deepspeed['zero_optimization']['zero_hpz_partition_size'] = self.zero_hpz_partition_size
+            logger.warn('If `zero_hpz_partition_size` (ZeRO++) causes grad_norm NaN, please'
+                        ' try `--torch_dtype float16`')
         logger.info(f'Using deepspeed: {self.deepspeed}')
 
     def _init_liger(self):
diff --git a/swift/llm/ds_config/zero3.json b/swift/llm/ds_config/zero3.json
index 23bbe0e507..9e842ce849 100644
--- a/swift/llm/ds_config/zero3.json
+++ b/swift/llm/ds_config/zero3.json
@@ -26,6 +26,8 @@
         "contiguous_gradients": true,
         "sub_group_size": 1e9,
         "reduce_bucket_size": "auto",
+        "zero_quantized_weights": false,
+        "zero_quantized_gradients": false,
         "stage3_prefetch_bucket_size": "auto",
         "stage3_param_persistence_threshold": "auto",
         "stage3_max_live_parameters": 1e9,
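
For a quick picture of the mechanics, here is a minimal, self-contained Python sketch of what the new flag does to the DeepSpeed config. The config dict is a trimmed stand-in for the built-in `zero3.json` (only the keys visible in this patch are reproduced), and the value `8` is a hypothetical choice corresponding to 8 GPUs per node, not something fixed by the patch.

```python
# Standalone sketch of what the new `--zero_hpz_partition_size` flag does,
# mirroring the `_init_deepspeed` change above.
import json

# Trimmed stand-in for swift/llm/ds_config/zero3.json; only keys visible in this patch.
zero3_config = {
    "zero_optimization": {
        "contiguous_gradients": True,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "zero_quantized_weights": False,    # ZeRO++ weight quantization stays off by default
        "zero_quantized_gradients": False,  # ZeRO++ gradient quantization stays off by default
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
    }
}

zero_hpz_partition_size = 8  # hypothetical value of --zero_hpz_partition_size

if zero_hpz_partition_size is not None:
    # Same injection performed in TrainArguments._init_deepspeed: the hpZ partition
    # size is written into the `zero_optimization` section that DeepSpeed consumes.
    zero3_config['zero_optimization']['zero_hpz_partition_size'] = zero_hpz_partition_size

print(json.dumps(zero3_config, indent=2))
```

On the command line this corresponds to something like `swift sft ... --deepspeed zero3 --zero_hpz_partition_size 8 --torch_dtype float16`, where `--torch_dtype float16` is the fallback the patch recommends if grad_norm turns NaN.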