351 changes: 351 additions & 0 deletions docs/source/BestPractices/Qwen3-VL最佳实践.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/source/Instruction/命令行参数.md
@@ -751,7 +751,7 @@ In addition to the model-specific parameters of qwen2_5_vl and qwen2_audio, qwen2_5_omni also
- MAX_NUM: Defaults to 12.
- INPUT_SIZE: Defaults to 448.

### internvl2, internvl2_phi3, internvl2_5, internvl3
### internvl2, internvl2_phi3, internvl2_5, internvl3, internvl3_5
For the meaning of these parameters, see [here](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B).
- MAX_NUM: Defaults to 12.
- INPUT_SIZE: Defaults to 448.
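MAX_NUM and INPUT_SIZE are environment variables read when the InternVL template preprocesses images. A minimal launch sketch, assuming an illustrative model, dataset, and values (none of them come from this PR):

```bash
# Hypothetical launch; MAX_NUM / INPUT_SIZE override the defaults of 12 / 448 documented above.
MAX_NUM=6 \
INPUT_SIZE=448 \
CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model OpenGVLab/InternVL2_5-2B \
    --dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#2000' \
    --train_type lora \
    --max_length 4096
```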
2 changes: 1 addition & 1 deletion docs/source/Megatron-SWIFT/命令行参数.md
@@ -196,7 +196,7 @@
- Note: In ms-swift versions earlier than 3.7.1, this defaults to None and is read automatically from config.json.
- moe_z_loss_coeff: Scaling coefficient for the z-loss. Defaults to None.
- 🔥moe_shared_expert_overlap: Enables overlap between shared-expert computation and dispatcher communication. If this option is not enabled, the shared experts run after the routed experts. Only effective when `moe_shared_expert_intermediate_size` is set. Defaults to False.
- 🔥moe_expert_capacity_factor: Capacity factor for each expert; None means no tokens are dropped. Defaults to None. When `--moe_expert_capacity_factor` is set, tokens that exceed an expert's capacity are dropped according to their selection probability. This can **balance the training load and speed up training** (for example, set it to 1).
- 🔥moe_expert_capacity_factor: Capacity factor for each expert; None means no tokens are dropped. Defaults to None. When `--moe_expert_capacity_factor` is set, tokens that exceed an expert's capacity are dropped according to their selection probability. This can **balance the training load and speed up training** (for example, set it to 1 or 2; a back-of-the-envelope sketch follows this list).
- moe_pad_expert_input_to_capacity: Pads each expert's input so that its length matches the expert capacity length. Defaults to False. This only takes effect when `--moe_expert_capacity_factor` is set.
- moe_token_drop_policy: Options are 'probs' and 'position'. Defaults to 'probs'.

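A back-of-the-envelope sketch of the dropping behavior described above. The capacity formula is an assumption about how Megatron-Core's router computes the per-expert budget, and the numbers are purely illustrative:

```bash
# Assumed formula: per-expert capacity ≈ tokens * top_k / num_experts * capacity_factor.
# Tokens routed to an expert beyond this budget are dropped according to --moe_token_drop_policy.
tokens=4096        # tokens in one micro-batch
top_k=8            # experts selected per token
num_experts=128    # number of routed experts
factor=2           # --moe_expert_capacity_factor 2
capacity=$(( tokens * top_k * factor / num_experts ))
echo "per-expert capacity: ${capacity} tokens"   # prints 512
```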
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -56,6 +56,7 @@ Swift DOCUMENTATION
BestPractices/GRPO多模态训练.md
BestPractices/GRPO代码训练.md
BestPractices/Qwen3最佳实践.md
BestPractices/Qwen3-VL最佳实践.md
BestPractices/Embedding训练.md
BestPractices/Reranker训练.md
BestPractices/快速训练VL模型.md
345 changes: 345 additions & 0 deletions docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/source_en/Instruction/Command-line-parameters.md
@@ -773,7 +773,7 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
- MAX_NUM: Default is 12
- INPUT_SIZE: Default is 448

### internvl2, internvl2_phi3, internvl2_5, internvl3
### internvl2, internvl2_phi3, internvl2_5, internvl3, internvl3_5
For the meaning of the arguments, please refer to [here](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B)
- MAX_NUM: Default is 12
- INPUT_SIZE: Default is 448
2 changes: 1 addition & 1 deletion docs/source_en/Megatron-SWIFT/Command-line-parameters.md
@@ -207,7 +207,7 @@ For guidance on selecting parallelization strategies, please refer to the [Train
- Note: In ms-swift versions earlier than 3.7.1, the default is None and the value is automatically loaded from config.json.
- moe_z_loss_coeff: Scaling coefficient for z-loss. Default is None.
- 🔥moe_shared_expert_overlap: Enables overlap between shared expert computation and the dispatcher. If not enabled, shared expert computation will be performed after routing experts. Only effective when `moe_shared_expert_intermediate_size` is set. Default is False.
- 🔥moe_expert_capacity_factor: Capacity factor for each expert. `None` means no tokens will be dropped. Default is `None`. When `--moe_expert_capacity_factor` is set, tokens exceeding an expert’s capacity will be dropped based on their selection probability. This can **balance the training load and improve training speed** (for example, set it to 1).
- 🔥moe_expert_capacity_factor: Capacity factor for each expert. `None` means no tokens will be dropped. Default is `None`. When `--moe_expert_capacity_factor` is set, tokens exceeding an expert’s capacity will be dropped based on their selection probability. This can **balance the training load and improve training speed** (for example, set it to 1 or 2).
- moe_pad_expert_input_to_capacity: Pad the input of each expert so that its length aligns with the expert capacity length. Default is `False`. This option only takes effect if `--moe_expert_capacity_factor` is set.
- moe_token_drop_policy: Options are 'probs' and 'position'. Default is 'probs'.

1 change: 1 addition & 0 deletions docs/source_en/index.rst
@@ -58,6 +58,7 @@ Swift DOCUMENTATION
BestPractices/GRPO-Multi-Modal-Training.md
BestPractices/GRPO-Code-Training.md
BestPractices/Qwen3-Best-Practice.md
BestPractices/Qwen3-VL-Best-Practice.md
BestPractices/Embedding.md
BestPractices/Reranker.md
BestPractices/Rapidly-Training-VL-model.md
47 changes: 47 additions & 0 deletions examples/models/qwen3_vl/mcore_full.sh
@@ -0,0 +1,47 @@
# 8 * 80GiB
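# IMAGE_MAX_TOKEN_NUM / VIDEO_MAX_TOKEN_NUM bound the visual-token budget for image and video
# inputs, and FPS_MAX_FRAMES limits how many frames are sampled per video; this summary is an
# assumption, see the Qwen3-VL best-practice doc added in this PR for the exact semantics.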
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
OMP_NUM_THREADS=14 \
NPROC_PER_NODE=8 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
IMAGE_MAX_TOKEN_NUM=1024 \
VIDEO_MAX_TOKEN_NUM=128 \
FPS_MAX_FRAMES=16 \
megatron sft \
--load Qwen3-VL-30B-A3B-Instruct-mcore \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#10000' \
'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
'swift/VideoChatGPT:Generic#2000' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--moe_permute_fusion true \
--tensor_model_parallel_size 2 \
--expert_model_parallel_size 8 \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--moe_aux_loss_coeff 1e-6 \
--micro_batch_size 1 \
--global_batch_size 4 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--max_epochs 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-5 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-6 \
--save megatron_output/Qwen3-VL-30B-A3B-Instruct \
--eval_interval 500 \
--save_interval 500 \
--max_length 4096 \
--packing true \
--num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--sequence_parallel true \
--moe_expert_capacity_factor 2 \
--optimizer_cpu_offload true \
--use_precision_aware_optimizer true \
--optimizer_offload_fraction 0.2 \
--attention_backend flash
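The `--load Qwen3-VL-30B-A3B-Instruct-mcore` path above assumes the Hugging Face checkpoint has already been converted to Megatron (mcore) format. A conversion sketch along the lines of the Megatron-SWIFT quickstart; verify the flags against the docs for your ms-swift version:

```bash
# One-time HF -> mcore conversion before running `megatron sft --load ...`.
CUDA_VISIBLE_DEVICES=0 \
swift export \
    --model Qwen/Qwen3-VL-30B-A3B-Instruct \
    --to_mcore true \
    --torch_dtype bfloat16 \
    --output_dir Qwen3-VL-30B-A3B-Instruct-mcore
```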
41 changes: 41 additions & 0 deletions examples/models/qwen3_vl/mixed.sh
@@ -0,0 +1,41 @@
# 2 * 21GiB
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
IMAGE_MAX_TOKEN_NUM=1024 \
VIDEO_MAX_TOKEN_NUM=128 \
FPS_MAX_FRAMES=16 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model Qwen/Qwen3-VL-4B-Instruct \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#10000' \
'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
'swift/VideoChatGPT:Generic#2000' \
--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--train_type lora \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--attn_impl flash_attn \
--padding_free true \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--freeze_vit true \
--freeze_aligner true \
--packing true \
--gradient_checkpointing true \
--vit_gradient_checkpointing false \
--gradient_accumulation_steps 2 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 4096 \
--output_dir output \
--warmup_ratio 0.05 \
--deepspeed zero2 \
--dataset_num_proc 4 \
--dataloader_num_workers 4
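After a LoRA run like the one above, a quick way to check the adapter is a streamed inference pass; the checkpoint path below is a placeholder for the actual `output/.../checkpoint-XXX` directory produced by training:

```bash
# Placeholder adapter path; --merge_lora true merges the adapter into the base weights before inference.
CUDA_VISIBLE_DEVICES=0 \
swift infer \
    --adapters output/vx-xxx/checkpoint-xxx \
    --stream true \
    --merge_lora true \
    --max_new_tokens 2048
```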
4 changes: 2 additions & 2 deletions swift/trainers/mixin.py
@@ -761,8 +761,8 @@ def _prepare_gradient_checkpointing(self, model) -> None:
else:
vision_tower.gradient_checkpointing_disable()
vision_tower.disable_input_require_grads()
except (NotImplementedError, AttributeError):
pass
except (NotImplementedError, AttributeError) as e:
logger.warning(f'prepare gradient_checkpointing failed: {e}')
# Avoid vit_gradient_checkpointing being overwritten by transformers.Trainer.gradient_checkpointing_enable.
self.args.gradient_checkpointing = False
