From 3fdee454fa133a0cbd45e326b9ca12498fac7fcd Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 28 Feb 2025 10:17:56 +0800
Subject: [PATCH 1/5] support ds partition

---
 swift/llm/ds_config/zero3.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/swift/llm/ds_config/zero3.json b/swift/llm/ds_config/zero3.json
index 23bbe0e507..5c0a0ec5df 100644
--- a/swift/llm/ds_config/zero3.json
+++ b/swift/llm/ds_config/zero3.json
@@ -22,6 +22,7 @@
             "device": "none",
             "pin_memory": true
         },
+        "zero_hpz_partition_size": 8,
         "overlap_comm": true,
         "contiguous_gradients": true,
         "sub_group_size": 1e9,

From 25679a32e6954f85ca0a9af2e1f01343f69c024a Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 28 Feb 2025 10:27:30 +0800
Subject: [PATCH 2/5] fix

---
 swift/llm/argument/train_args.py | 6 ++++++
 swift/llm/ds_config/zero3.json   | 1 -
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/swift/llm/argument/train_args.py b/swift/llm/argument/train_args.py
index 152597aaf7..63b639ded7 100644
--- a/swift/llm/argument/train_args.py
+++ b/swift/llm/argument/train_args.py
@@ -162,6 +162,9 @@ class TrainArguments(SwanlabArguments, TorchAccArguments, TunerArguments, Seq2Se
     temperature: float = 0.
     load_args: bool = False

+    # zero++
+    zero_hpz_partition_size: Optional[int] = None
+
     def __post_init__(self) -> None:
         if self.resume_from_checkpoint:
             self.resume_from_checkpoint = to_abspath(self.resume_from_checkpoint, True)
@@ -237,6 +240,9 @@ def _init_deepspeed(self):
                     break

         self.deepspeed = self.parse_to_dict(self.deepspeed)
+        if self.zero_hpz_partition_size is not None:
+            assert 'zero_optimization' in self.deepspeed
+            self.deepspeed['zero_optimization']['zero_hpz_partition_size'] = self.zero_hpz_partition_size
         logger.info(f'Using deepspeed: {self.deepspeed}')

     def _init_liger(self):
diff --git a/swift/llm/ds_config/zero3.json b/swift/llm/ds_config/zero3.json
index 5c0a0ec5df..23bbe0e507 100644
--- a/swift/llm/ds_config/zero3.json
+++ b/swift/llm/ds_config/zero3.json
@@ -22,7 +22,6 @@
             "device": "none",
             "pin_memory": true
         },
-        "zero_hpz_partition_size": 8,
         "overlap_comm": true,
         "contiguous_gradients": true,
         "sub_group_size": 1e9,

From ab400fba30e1325fe6816eeeb1292c69e1b37c71 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 28 Feb 2025 11:48:06 +0800
Subject: [PATCH 3/5] add doc

---
 ...5\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" | 1 +
 docs/source_en/Instruction/Command-line-parameters.md            | 1 +
 2 files changed, 2 insertions(+)

diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 1b3378dcd7..d07570a021 100644
--- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -105,6 +105,7 @@
 - 🔥output_dir: 默认为None,设置为`output/`
 - 🔥gradient_checkpointing: 是否使用gradient_checkpointing,默认为True
 - 🔥deepspeed: 默认为None。可以设置为'zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload'来使用ms-swift内置的deepspeed配置文件
+- zero_hpz_partition_size: 默认为None,这个参数是ZeRO++的特性,即node内模型分片,node间数据分片,如果遇到grad_norm NaN,请尝试使用`--torch_dtype float16`
 - 🔥per_device_train_batch_size: 默认值1
 - 🔥per_device_eval_batch_size: 默认值1
 - weight_decay: weight衰减系数,默认值0.1
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index ffd8053cc6..4969a61ecd 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -109,6 +109,7 @@ This parameter list inherits from transformers `Seq2SeqTrainingArguments`, with
 - 🔥output_dir: Defaults to None, set as `output/`.
 - 🔥gradient_checkpointing: Whether to use gradient checkpointing, default is True.
 - 🔥deepspeed: Defaults to None. It can be set to 'zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload' to use the built-in deepspeed configuration file of ms-swift.
+- zero_hpz_partition_size: Default is `None`. This parameter is a feature of `ZeRO++`, which implements model sharding within nodes and data sharding between nodes. If you encounter grad_norm `NaN` issues, please try using `--torch_dtype float16`
 - 🔥per_device_train_batch_size: Default is 1.
 - 🔥per_device_eval_batch_size: Default is 1.
 - weight_decay: Weight decay coefficient, default value is 0.1.

From 177583f53dcb37fc8e3f25263d5750d63e0864c9 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 28 Feb 2025 12:29:33 +0800
Subject: [PATCH 4/5] add warn message

---
 swift/llm/argument/train_args.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/swift/llm/argument/train_args.py b/swift/llm/argument/train_args.py
index 63b639ded7..a2a98c5a28 100644
--- a/swift/llm/argument/train_args.py
+++ b/swift/llm/argument/train_args.py
@@ -243,6 +243,8 @@ def _init_deepspeed(self):
         if self.zero_hpz_partition_size is not None:
             assert 'zero_optimization' in self.deepspeed
             self.deepspeed['zero_optimization']['zero_hpz_partition_size'] = self.zero_hpz_partition_size
+            logger.warn('If `zero_hpz_partition_size`(ZeRO++) causes grad_norm NaN, please'
+                        ' try `--torch_dtype float16`')
         logger.info(f'Using deepspeed: {self.deepspeed}')

     def _init_liger(self):

From c5486dc0d97597e786b8d27e0aa391aa09dc642f Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 28 Feb 2025 12:35:03 +0800
Subject: [PATCH 5/5] fix

---
 swift/llm/ds_config/zero3.json | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/swift/llm/ds_config/zero3.json b/swift/llm/ds_config/zero3.json
index 23bbe0e507..9e842ce849 100644
--- a/swift/llm/ds_config/zero3.json
+++ b/swift/llm/ds_config/zero3.json
@@ -26,6 +26,8 @@
         "contiguous_gradients": true,
         "sub_group_size": 1e9,
         "reduce_bucket_size": "auto",
+        "zero_quantized_weights": false,
+        "zero_quantized_gradients": false,
         "stage3_prefetch_bucket_size": "auto",
         "stage3_param_persistence_threshold": "auto",
         "stage3_max_live_parameters": 1e9,
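
For reference, a rough sketch of the `zero_optimization` block that `_init_deepspeed`
produces at runtime once it injects the new flag into the built-in zero3.json, shown
with the illustrative value 8 that PATCH 1/5 briefly hard-coded before it became the
`--zero_hpz_partition_size` CLI argument. Only keys visible in the hunks above are
listed, and `"stage": 3` plus the key order are assumed from a standard ZeRO-3 config,
so treat this as a sketch rather than the exact contents:

    "zero_optimization": {
        "stage": 3,
        "zero_hpz_partition_size": 8,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "zero_quantized_weights": false,
        "zero_quantized_gradients": false,
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9
    }

With `zero_hpz_partition_size` set to the number of GPUs per node (typically 8),
ZeRO++ keeps a secondary partition of the ZeRO-3 parameter shards inside each node,
so forward/backward all-gathers stay on intra-node links while data parallelism still
spans nodes; the `zero_quantized_weights` and `zero_quantized_gradients` switches
added in PATCH 5/5 correspond to the other two ZeRO++ features and remain off by
default.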