From 64e85d01584c1bd6eb74fb3a6d86988ec7fd8dc1 Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Thu, 13 Nov 2025 21:02:26 +0800
Subject: [PATCH] fix packing_length

---
 docs/source/Megatron-SWIFT/Command-line-parameters.md | 1 +
 docs/source/Megatron-SWIFT/Mcore-Bridge.md | 1 +
 docs/source_en/Megatron-SWIFT/Command-line-parameters.md | 2 ++
 docs/source_en/Megatron-SWIFT/Mcore-Bridge.md | 1 +
 swift/llm/train/sft.py | 1 +
 swift/megatron/convert.py | 2 +-
 6 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/source/Megatron-SWIFT/Command-line-parameters.md b/docs/source/Megatron-SWIFT/Command-line-parameters.md
index c43e60923b..18033f3eb1 100644
--- a/docs/source/Megatron-SWIFT/Command-line-parameters.md
+++ b/docs/source/Megatron-SWIFT/Command-line-parameters.md
@@ -5,6 +5,7 @@
 **训练参数**:
 - 🔥micro_batch_size: 每个device的批次大小,默认为1。
 - 🔥global_batch_size: 总批次大小,等价于`micro_batch_size*数据并行大小*梯度累加步数`。默认为16。
+  - 其中,`数据并行大小 (DP) = 总GPU数 / (TP × PP × CP)`。
 - 🔥recompute_granularity: 重新计算激活的粒度,可选项为'full', 'selective'。其中full代表重新计算整个transformer layer,selective代表只计算transformer layer中的核心注意力部分。通常'selective'是推荐的。默认为'selective'。
   - 当你设置为'selective'时,你可以通过指定`--recompute_modules`来选择对哪些部分进行重新计算。
 - 🔥recompute_method: 该参数需将recompute_granularity设置为'full'才生效,可选项为'uniform', 'block'。默认为None。
diff --git a/docs/source/Megatron-SWIFT/Mcore-Bridge.md b/docs/source/Megatron-SWIFT/Mcore-Bridge.md
index c13a62ccac..5579f43f2d 100644
--- a/docs/source/Megatron-SWIFT/Mcore-Bridge.md
+++ b/docs/source/Megatron-SWIFT/Mcore-Bridge.md
@@ -193,6 +193,7 @@ swift infer \
 ## 导出与转换精度测试
 Mcore-Bridge除了支持在训练中进行safetensors的转换和保存,也支持了`megatron export`命令用于单独的权重导出。`megatron export`支持在权重转换时,对转换精度进行测试,这在接入新模型时验证接入准确性很有帮助。通常,Megatron-SWIFT已经接入的模型不会出现精度不对齐的情况,你可以放心设置`--test_convert_precision false`。
+- 提示:多模态模型请关注`mean_diff (with loss)`字段,`mean_diff`因包含图像tokens且该部分不计算损失,有较大的diff。
 全参数权重:
 ```shell
diff --git a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md
index 59e94335d9..ee6111bc94 100644
--- a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md
+++ b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md
@@ -6,6 +6,7 @@

 - 🔥micro_batch_size: Batch size per device, default is 1.
 - 🔥global_batch_size: Total batch size, equivalent to `micro_batch_size * data parallel size * gradient accumulation steps`. Default is 16.
+  - Here, `data parallel size (DP) = total number of GPUs / (TP × PP × CP)`.
 - 🔥recompute_granularity: Granularity of activation recomputation, options are 'full', 'selective'. 'full' means recomputing the entire transformer layer, while 'selective' means only recomputing the core attention part of the transformer layer. 'selective' is generally recommended. Default is 'selective'.
   - When you set it to 'selective', you can specify `--recompute_modules` to choose which parts to recompute.
 - 🔥recompute_method: This parameter takes effect only when recompute_granularity is set to 'full', options are 'uniform', 'block'. Default is None.
@@ -315,6 +316,7 @@ Megatron training parameters are inherited from Megatron parameters and basic pa
 - 🔥packing: Whether to use sequence packing to improve computational efficiency (achieving better load balancing across nodes and processes, and higher GPU utilization), at the cost of additional preprocessing time, while also stabilizing GPU memory usage. Defaults to `False`. Currently supported for CPT, SFT, DPO, KTO and RM.
   - Note: **Sequences within the same batch remain mutually invisible**, except for Qwen3-Next.
   - Note: **Packing reduces the number of samples in the dataset; please adjust the gradient accumulation steps and learning rate accordingly**.
+- packing_length: The length used for packing. Defaults to None, in which case it is set to `max_length`.
 - streaming: Stream data loading and processing, default is False.
   - Note: Since the length of a streaming dataset cannot be determined, the `--train_iters` parameter must be set. Also set the `max_epochs` parameter to ensure training exits after the specified number of epochs, and to validate and save the model weights accordingly.
   - Note: Streaming datasets can skip preprocessing wait time by overlapping preprocessing with training. Preprocessing for streaming datasets is performed only on rank 0 and then synchronized to other processes via data distribution. **This is generally less efficient than the data sharding approach used in non-streaming datasets.** When the training world_size is large, preprocessing and data distribution can become a training bottleneck.
diff --git a/docs/source_en/Megatron-SWIFT/Mcore-Bridge.md b/docs/source_en/Megatron-SWIFT/Mcore-Bridge.md
index 1e42db1aab..54a3d05694 100644
--- a/docs/source_en/Megatron-SWIFT/Mcore-Bridge.md
+++ b/docs/source_en/Megatron-SWIFT/Mcore-Bridge.md
@@ -203,6 +203,7 @@ swift infer \
 ## Export and Conversion Precision Testing
 In addition to supporting safetensors conversion and saving during training, Mcore-Bridge also supports the `megatron export` command for standalone weight export. `megatron export` supports conversion precision testing during weight conversion, which is very helpful for verifying accuracy when integrating new models. Typically, models already integrated into Megatron-SWIFT will not have precision misalignment issues, so you can confidently set `--test_convert_precision false`.
+- Note: For multimodal models, please focus on the `mean_diff (with loss)` field; `mean_diff` can show a large difference because it includes image tokens, for which no loss is computed.

 Full parameter weights:

diff --git a/swift/llm/train/sft.py b/swift/llm/train/sft.py
index 502b895646..8b5c631a6b 100644
--- a/swift/llm/train/sft.py
+++ b/swift/llm/train/sft.py
@@ -161,6 +161,7 @@ def _post_process_datasets(self, datasets: List) -> List:
                 template,
                 dataset,
                 num_proc=args.dataset_num_proc,
+                packing_length=args.packing_length,
                 strict=args.strict,
                 load_from_cache_file=args.load_from_cache_file)
         elif args.streaming:
diff --git a/swift/megatron/convert.py b/swift/megatron/convert.py
index 1e943d724f..2dc2263552 100644
--- a/swift/megatron/convert.py
+++ b/swift/megatron/convert.py
@@ -220,7 +220,7 @@ def test_convert_precision(hf_model, mg_model, template, torch_dtype=torch.float
     print(f'token_mean_diff: {token_mean_diff}')
     print(f'mean_diff: {mean_diff}, max_diff: {max_diff}')
     print(f'mean_diff (with loss): {mean_diff_with_loss}, max_diff (with loss): {max_diff_with_loss} '
-          '(Please check that mean_diff is less than 0.1).')
+          '(Please check that mean_diff (with loss) is less than 0.1).')
     hf_tokens = hf_logits.argmax(-1)
     mg_tokens = mg_logits.argmax(-1)
     print(f'hf_tokens: {hf_tokens[0].tolist()}\nmg_tokens: {mg_tokens[0].tolist()}')
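For reference, the batch-size relationship documented in the parameter notes above can be sanity-checked with a short script. This is an illustrative sketch only (the helper names and example values are not part of this patch or of Megatron-SWIFT); it simply restates `global_batch_size = micro_batch_size * data parallel size * gradient accumulation steps` with `DP = total GPUs / (TP × PP × CP)`, plus the documented `packing_length` fallback to `max_length`.

```python
# Illustrative sketch -- not part of this patch or of the Megatron-SWIFT codebase.
from typing import Optional


def gradient_accumulation_steps(global_batch_size: int, micro_batch_size: int,
                                num_gpus: int, tp: int = 1, pp: int = 1, cp: int = 1) -> int:
    """Derive gradient accumulation steps from the documented formula."""
    model_parallel = tp * pp * cp
    assert num_gpus % model_parallel == 0, 'num_gpus must be divisible by TP * PP * CP'
    dp = num_gpus // model_parallel  # data parallel size
    assert global_batch_size % (micro_batch_size * dp) == 0, \
        'global_batch_size must be divisible by micro_batch_size * DP'
    return global_batch_size // (micro_batch_size * dp)


def effective_packing_length(packing_length: Optional[int], max_length: int) -> int:
    """packing_length falls back to max_length when it is not set."""
    return packing_length if packing_length is not None else max_length


if __name__ == '__main__':
    # 16 GPUs with TP=2, PP=2, CP=1 -> DP=4; global 16 / (micro 1 * DP 4) = 4 accumulation steps.
    print(gradient_accumulation_steps(global_batch_size=16, micro_batch_size=1, num_gpus=16, tp=2, pp=2))
    print(effective_packing_length(None, 8192))  # -> 8192
```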