From 3fdee454fa133a0cbd45e326b9ca12498fac7fcd Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 28 Feb 2025 10:17:56 +0800
Subject: [PATCH 1/5] support ds partition

---
 swift/llm/ds_config/zero3.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/swift/llm/ds_config/zero3.json b/swift/llm/ds_config/zero3.json
index 23bbe0e507..5c0a0ec5df 100644
--- a/swift/llm/ds_config/zero3.json
+++ b/swift/llm/ds_config/zero3.json
@@ -22,6 +22,7 @@
             "device": "none",
             "pin_memory": true
         },
+        "zero_hpz_partition_size": 8,
         "overlap_comm": true,
         "contiguous_gradients": true,
         "sub_group_size": 1e9,

From 25679a32e6954f85ca0a9af2e1f01343f69c024a Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 28 Feb 2025 10:27:30 +0800
Subject: [PATCH 2/5] fix

---
 swift/llm/argument/train_args.py | 6 ++++++
 swift/llm/ds_config/zero3.json   | 1 -
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/swift/llm/argument/train_args.py b/swift/llm/argument/train_args.py
index 152597aaf7..63b639ded7 100644
--- a/swift/llm/argument/train_args.py
+++ b/swift/llm/argument/train_args.py
@@ -162,6 +162,9 @@ class TrainArguments(SwanlabArguments, TorchAccArguments, TunerArguments, Seq2Se
     temperature: float = 0.
     load_args: bool = False

+    # zero++
+    zero_hpz_partition_size: Optional[int] = None
+
     def __post_init__(self) -> None:
         if self.resume_from_checkpoint:
             self.resume_from_checkpoint = to_abspath(self.resume_from_checkpoint, True)
@@ -237,6 +240,9 @@ def _init_deepspeed(self):
                     break

         self.deepspeed = self.parse_to_dict(self.deepspeed)
+        if self.zero_hpz_partition_size is not None:
+            assert 'zero_optimization' in self.deepspeed
+            self.deepspeed['zero_optimization']['zero_hpz_partition_size'] = self.zero_hpz_partition_size
         logger.info(f'Using deepspeed: {self.deepspeed}')

     def _init_liger(self):
diff --git a/swift/llm/ds_config/zero3.json b/swift/llm/ds_config/zero3.json
index 5c0a0ec5df..23bbe0e507 100644
--- a/swift/llm/ds_config/zero3.json
+++ b/swift/llm/ds_config/zero3.json
@@ -22,7 +22,6 @@
             "device": "none",
             "pin_memory": true
         },
-        "zero_hpz_partition_size": 8,
         "overlap_comm": true,
         "contiguous_gradients": true,
         "sub_group_size": 1e9,

From ab400fba30e1325fe6816eeeb1292c69e1b37c71 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 28 Feb 2025 11:48:06 +0800
Subject: [PATCH 3/5] add doc

---
 ...5\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" | 1 +
 docs/source_en/Instruction/Command-line-parameters.md            | 1 +
 2 files changed, 2 insertions(+)

diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 1b3378dcd7..d07570a021 100644
--- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -105,6 +105,7 @@
 - 🔥output_dir: 默认为None,设置为`output/`
 - 🔥gradient_checkpointing: 是否使用gradient_checkpointing,默认为True
 - 🔥deepspeed: 默认为None。可以设置为'zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload'来使用ms-swift内置的deepspeed配置文件
+- zero_hpz_partition_size: 默认为None,这个参数是ZeRO++的特性,即node内模型分片,node间数据分片,如果遇到grad_norm NaN,请尝试使用`--torch_dtype float16`
 - 🔥per_device_train_batch_size: 默认值1
 - 🔥per_device_eval_batch_size: 默认值1
 - weight_decay: weight衰减系数,默认值0.1
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index ffd8053cc6..4969a61ecd 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -109,6 +109,7 @@ This parameter list inherits from transformers `Seq2SeqTrainingArguments`, with
 - 🔥output_dir: Defaults to None, set as `output/`.
 - 🔥gradient_checkpointing: Whether to use gradient checkpointing, default is True.
 - 🔥deepspeed: Defaults to None. It can be set to 'zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload' to use the built-in deepspeed configuration file of ms-swift.
+- zero_hpz_partition_size: Default is `None`. This parameter is a feature of `ZeRO++`, which implements model sharding within nodes and data sharding between nodes. If you encounter grad_norm `NaN` issues, please try using `--torch_dtype float16`
 - 🔥per_device_train_batch_size: Default is 1.
 - 🔥per_device_eval_batch_size: Default is 1.
 - weight_decay: Weight decay coefficient, default value is 0.1.

From 177583f53dcb37fc8e3f25263d5750d63e0864c9 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 28 Feb 2025 12:29:33 +0800
Subject: [PATCH 4/5] add warn message

---
 swift/llm/argument/train_args.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/swift/llm/argument/train_args.py b/swift/llm/argument/train_args.py
index 63b639ded7..a2a98c5a28 100644
--- a/swift/llm/argument/train_args.py
+++ b/swift/llm/argument/train_args.py
@@ -243,6 +243,8 @@ def _init_deepspeed(self):
         if self.zero_hpz_partition_size is not None:
             assert 'zero_optimization' in self.deepspeed
             self.deepspeed['zero_optimization']['zero_hpz_partition_size'] = self.zero_hpz_partition_size
+            logger.warn('If `zero_hpz_partition_size`(ZeRO++) causes grad_norm NaN, please'
+                        ' try `--torch_dtype float16`')
         logger.info(f'Using deepspeed: {self.deepspeed}')

     def _init_liger(self):

From c5486dc0d97597e786b8d27e0aa391aa09dc642f Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 28 Feb 2025 12:35:03 +0800
Subject: [PATCH 5/5] fix

---
 swift/llm/ds_config/zero3.json | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/swift/llm/ds_config/zero3.json b/swift/llm/ds_config/zero3.json
index 23bbe0e507..9e842ce849 100644
--- a/swift/llm/ds_config/zero3.json
+++ b/swift/llm/ds_config/zero3.json
@@ -26,6 +26,8 @@
         "contiguous_gradients": true,
         "sub_group_size": 1e9,
         "reduce_bucket_size": "auto",
+        "zero_quantized_weights": false,
+        "zero_quantized_gradients": false,
         "stage3_prefetch_bucket_size": "auto",
         "stage3_param_persistence_threshold": "auto",
         "stage3_max_live_parameters": 1e9,
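
For reference, a rough sketch of the `zero_optimization` block that `_init_deepspeed`
produces at runtime once it injects the new flag into the built-in zero3.json, shown
with the illustrative value 8 that PATCH 1/5 briefly hard-coded before it became the
`--zero_hpz_partition_size` CLI argument. Only keys visible in the hunks above are
listed, and `"stage": 3` plus the key order are assumed from a standard ZeRO-3 config,
so treat this as a sketch rather than the exact contents:

    "zero_optimization": {
        "stage": 3,
        "zero_hpz_partition_size": 8,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "zero_quantized_weights": false,
        "zero_quantized_gradients": false,
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9
    }

With `zero_hpz_partition_size` set to the number of GPUs per node (typically 8),
ZeRO++ keeps a secondary partition of the ZeRO-3 parameter shards inside each node,
so forward/backward all-gathers stay on intra-node links while data parallelism still
spans nodes; the `zero_quantized_weights` and `zero_quantized_gradients` switches
added in PATCH 5/5 correspond to the other two ZeRO++ features and remain off by
default.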