From 646e87e519b416658901b2b4e4c3d6120fba63e3 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Sun, 19 Oct 2025 23:50:19 +0800
Subject: [PATCH 01/11] update qwen3_vl docs

---
 ...00\344\275\263\345\256\236\350\267\265.md" | 348 ++++++++++++++++++
 ...44\350\241\214\345\217\202\346\225\260.md" |   2 +-
 .../BestPractices/Qwen3-VL-Best-Practice.md   | 342 +++++++++++++++++
 .../Instruction/Command-line-parameters.md    |   2 +-
 examples/models/qwen3_vl/mcore_full.sh        |  47 +++
 examples/models/qwen3_vl/mixed.sh             |  41 +++
 swift/trainers/mixin.py                       |   4 +-
 7 files changed, 782 insertions(+), 4 deletions(-)
 create mode 100644 "docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
 create mode 100644 docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
 create mode 100644 examples/models/qwen3_vl/mcore_full.sh
 create mode 100644 examples/models/qwen3_vl/mixed.sh

diff --git "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
new file mode 100644
index 0000000000..3705d09e22
--- /dev/null
+++ "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
@@ -0,0 +1,348 @@
+
+# Qwen3-VL最佳实践
+
+## 环境准备
+
+在开始推理和训练之前，请确保您的环境已准备妥当。
+
+```shell
+pip install "transformers>=4.57" "qwen_vl_utils>=0.0.14"
+
+pip install "ms-swift>=3.9.1"
+```
+
+
+## 推理
+
+使用transformers推理：
+```python
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+from modelscope import snapshot_download
+from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+
+model_dir = snapshot_download('Qwen/Qwen3-VL-4B-Instruct')
+model = Qwen3VLForConditionalGeneration.from_pretrained(
+    model_dir, dtype="auto", device_map="auto",
+    # attn_implementation='flash_attention_2',
+)
+
+processor = AutoProcessor.from_pretrained(model_dir)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "video",
+                "video": "https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4",
+                "max_pixels": 128*32*32,
+                "max_frames": 16,
+            },
+            {"type": "text", "text": "Describe this video."},
+        ],
+    }
+]
+
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True,
+                                                                image_patch_size= 16,
+                                                                return_video_metadata=True)
+if video_inputs is not None:
+    video_inputs, video_metadatas = zip(*video_inputs)
+    video_inputs, video_metadatas = list(video_inputs), list(video_metadatas)
+else:
+    video_metadatas = None
+inputs = processor(text=[text], images=image_inputs, videos=video_inputs, video_metadata=video_metadatas, **video_kwargs, do_resize=False, return_tensors="pt")
+inputs = inputs.to('cuda')
+
+generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
+generated_ids_trimmed = [
+    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+]
+output_text = processor.batch_decode(
+    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print(output_text[0])
+# 'A baby wearing glasses sits on a bed, engrossed in reading a book. The baby turns the pages with both hands, occasionally looking up and smiling. The room is cozy, with a crib in the background and clothes scattered around. The baby’s focus and curiosity are evident as they explore the book, creating a heartwarming scene of early learning and discovery.'
+```
+
+使用ms-swift的PtEngine进行推理：
+```python
+import os
+# os.environ['SWIFT_DEBUG'] = '1'
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+os.environ['VIDEO_MAX_TOKEN_NUM'] = '128'
+os.environ['FPS_MAX_FRAMES'] = '16'
+
+
+from swift.llm import PtEngine, InferRequest, RequestConfig
+engine = PtEngine('Qwen/Qwen3-VL-4B-Instruct', attn_impl='flash_attention_2')
+infer_request = InferRequest(messages=[{
+    "role": "user",
+    "content": '<video>Describe this video.',
+}], videos=['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4'])
+request_config = RequestConfig(max_tokens=128, temperature=0)
+resp_list = engine.infer([infer_request], request_config=request_config)
+response = resp_list[0].choices[0].message.content
+# 'A baby wearing glasses sits on a bed, engrossed in reading a book. The baby turns the pages with both hands, occasionally looking up and smiling. The room is cozy, with a crib in the background and clothes scattered around. The baby’s focus and curiosity are evident as they explore the book, creating a heartwarming scene of early learning and discovery.'
+
+# use stream
+request_config = RequestConfig(max_tokens=128, temperature=0, stream=True)
+gen_list = engine.infer([infer_request], request_config=request_config)
+for chunk in gen_list[0]:
+    if chunk is None:
+        continue
+    print(chunk.choices[0].delta.content, end='', flush=True)
+print()
+```
+
+使用命令行推理：
+```shell
+CUDA_VISIBLE_DEVICES=0 \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+swift infer \
+    --model Qwen/Qwen3-VL-4B-Instruct \
+    --stream true
+```
+
+```
+<<< who are you?
+Hello! I'm Qwen, a large-scale language model independently developed by the Tongyi Lab under Alibaba Group. My main functions include answering questions, creating text such as stories, official documents, emails, scripts, and more, as well as performing logical reasoning, programming, and other tasks. If you have any questions or need assistance, feel free to let me know anytime, and I'll do my best to help!
+--------------------------------------------------
+<<< <image>describe the image.
+Input an image path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png
+This is a beautifully detailed, close-up portrait of an adorable tabby kitten, rendered with a soft, painterly effect that gives it a gentle, dreamy quality.
+
+Here’s a breakdown of the image:
+
+- **The Kitten:** The subject is a young, fluffy kitten with a classic tabby pattern. Its fur is a mix of white and soft grayish-brown stripes, with a prominent dark stripe running down the center of its forehead and over its nose. The kitten’s face is predominantly white, with delicate markings around its eyes and cheeks.
+
+- **The Eyes:** Its most captivating feature is its large, round, and expressive eyes. They are a striking shade of bright blue-gray, with dark pupils that give it an intense, curious, and slightly innocent gaze. The eyes are wide open, suggesting the kitten is alert and attentive.
+
+- **The Expression:** The kitten’s expression is sweet and innocent. Its small pink nose and slightly parted mouth give it a gentle, almost pleading look. Its whiskers are long and white, standing out against its fur.
+
+- **The Style:** The image has a soft-focus, artistic quality, reminiscent of impressionist painting. The edges of the kitten’s fur are slightly blurred, creating a halo effect that draws attention to its face. The background is softly blurred with muted tones of green and gray, which helps the kitten stand out as the clear focal point.
+
+- **Overall Impression:** The image evokes feelings of warmth, cuteness, and tenderness. The kitten appears to be looking directly at the viewer, creating a sense of connection and affection.
+
+This is a lovely and charming depiction of a young kitten, capturing its innocence and charm in a visually appealing and emotionally engaging way.
+--------------------------------------------------
+<<< <video>describe the video.
+Input a video path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4
+This video captures a charming and adorable moment of a young child, likely a toddler, sitting on a bed and pretending to read a book. The child is wearing glasses, which adds a humorous and endearing touch to the scene — as if they’re a little scholar or librarian.
+
+Here’s a breakdown of what unfolds:
+
+- The child is seated cross-legged on a bed with a patterned quilt. Behind them, a crib and some household items are visible, suggesting a cozy bedroom setting.
+
+- The child holds an open book and appears to be turning the pages with focused attention, mimicking the behavior of a real reader.
+
+- At one point, the child looks up, smiles, or seems to react with delight — perhaps amused by something in the book or just enjoying the activity.
+
+- The child’s movements are gentle and deliberate, showing a sense of concentration and curiosity. They turn pages, sometimes with one hand, and occasionally lift the book slightly as if to examine it more closely.
+
+- The video has a warm, candid feel — it’s not staged, and the child’s natural behavior makes it feel authentic and heartwarming.
+
+Overall, this is a sweet, lighthearted video that showcases the innocence and imagination of early childhood. The child’s engagement with the book, combined with their glasses and playful demeanor, creates a delightful and memorable scene.
+```
+
+- 其中特定模型参数，例如`VIDEO_MAX_TOKEN_NUM`等环境变量的含义参考[命令行参数文档](../Instruction/命令行参数.md#qwen3_vl)。
+
+
+## 训练
+
+这里将介绍如何使用ms-swift与Megatron-SWIFT对Qwen3-VL进行训练。推荐Dense模型使用ms-swift（即transformers后端，更加方便简单），而MoE模型使用Megatron-SWIFT（即megatron后端，更快的训练速度，benchmark查看[这里](../Megatron-SWIFT/快速开始.md#benchmark)）。
+
+如果您需要使用自定义数据集微调模型，您可以将数据准备成以下格式，并在命令行中设置`--dataset train.jsonl --val_dataset val.jsonl`，其中验证集为可选。更多介绍请参考[多模态数据集文档](../Customization/自定义数据集.md#多模态)。
+```jsonl
+{"messages": [{"role": "user", "content": "浙江的省会在哪？"}, {"role": "assistant", "content": "浙江的省会在杭州。"}]}
+{"messages": [{"role": "user", "content": "<image><image>两张图片有什么区别"}, {"role": "assistant", "content": "前一张是小猫，后一张是小狗"}], "images": ["/xxx/x.jpg", "/xxx/x.png"]}
+{"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "<image>图片中是什么，<video>视频中是什么"}, {"role": "assistant", "content": "图片中是一个大象，视频中是一只小狗在草地上奔跑"}], "images": ["/xxx/x.jpg"], "videos": ["/xxx/x.mp4"]}
+```
+
+Qwen3-VL的bbox输出采用归一化1000的相对坐标。你可以使用ms-swift提供的grounding数据集格式，其中"bbox"中的坐标为绝对坐标，ms-swift会自动将绝对坐标转为归一化1000的相对坐标。更多信息请参考[grounding数据集格式文档](../Customization/自定义数据集.md#grounding)。
+```jsonl
+{"messages": [{"role": "user", "content": "<image>找到图像中的<ref-object>"}, {"role": "assistant", "content": "[\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n]"}], "images": ["cat.png"], "objects": {"ref": ["羊", "羊", "羊"], "bbox": [[90.9, 160.8, 135, 212.8], [360.9, 480.8, 495, 532.8]]}}
+```
+
+### Dense模型
+
+以下提供对`Qwen3-VL-4B-Instruct`模型的微调脚本，我们使用混合模态数据作为Demo数据集，该示例脚本没有实用价值。训练显存为2 * 21GiB，训练时间为12分钟。
+
+```shell
+# 2 * 21GiB
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+NPROC_PER_NODE=2 \
+CUDA_VISIBLE_DEVICES=0,1 \
+swift sft \
+    --model Qwen/Qwen3-VL-4B-Instruct \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#10000' \
+              'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
+              'swift/VideoChatGPT:Generic#2000' \
+    --load_from_cache_file true \
+    --split_dataset_ratio 0.01 \
+    --train_type lora \
+    --torch_dtype bfloat16 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --attn_impl flash_attn \
+    --padding_free true \
+    --learning_rate 1e-4 \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --freeze_vit true \
+    --freeze_aligner true \
+    --packing true \
+    --gradient_checkpointing true \
+    --vit_gradient_checkpointing false \
+    --gradient_accumulation_steps 2 \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 2 \
+    --logging_steps 5 \
+    --max_length 4096 \
+    --output_dir output \
+    --warmup_ratio 0.05 \
+    --deepspeed zero2 \
+    --dataset_num_proc 4 \
+    --dataloader_num_workers 4
+```
+
+训练结束后，我们使用以下脚本对验证集进行推理：
+```shell
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+CUDA_VISIBLE_DEVICES=0 \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+swift infer \
+    --adapters output/vx-xxx/checkpoint-xxx \
+    --stream true \
+    --max_new_tokens 2048 \
+    --load_data_args true
+```
+
+```
+--------------------------------------------------
+[QUERY] Using LaTeX to perform OCR on the image.
+[LABELS] 1 + \frac { 1 } { 1 ! } + \frac { 1 } { 2 ! } + \frac { 1 } { 3 ! } + \frac { 1 } { 4 ! }
+[RESPONSE] 1 + \frac { 1 } { 1 ! } + \frac { 1 } { 2 ! } + \frac { 1 } { 3 ! } + \frac { 1 } { 4 ! }
+--------------------------------------------------
+[QUERY] What color suit is the man wearing while playing the saxophone on stage?
+[LABELS] The man is wearing a black suit and white shirt while playing the saxophone on the red-floored stage.
+[RESPONSE] The man is wearing a black suit while playing the saxophone on stage.
+--------------------------------------------------
+...
+```
+
+### MoE模型
+
+
+以下提供对`Qwen3-VL-30B-A3B-Instruct`模型的微调脚本，我们使用Megatron-SWIFT进行单机全参数训练。我们依旧采用混合数据进行训练，该示例脚本没有实用价值。训练所需显存资源为8 * 80GiB，训练时间为20分钟。
+
+关于Megatron-SWIFT的环境安装，请参考[Megatron-SWIFT文档](../Megatron-SWIFT/快速开始.md)。Megatron-SWIFT与ms-swift共用template和dataset模块，因此前面介绍的自定义数据集格式和模型特有环境变量依旧生效。
+
+HF格式权重转为Megatron格式：
+```shell
+CUDA_VISIBLE_DEVICES=0,1 \
+swift export \
+    --model Qwen/Qwen3-VL-30B-A3B-Instruct \
+    --to_mcore true \
+    --torch_dtype bfloat16 \
+    --output_dir Qwen3-VL-30B-A3B-Instruct-mcore
+```
+
+微调脚本如下，训练技巧与并行技术的调整参考[Megatron-SWIFT文档](../Megatron-SWIFT/快速开始.md#训练技巧)。
+```shell
+# 8 * 80GiB
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+OMP_NUM_THREADS=14 \
+NPROC_PER_NODE=8 \
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+megatron sft \
+    --load Qwen3-VL-30B-A3B-Instruct-mcore \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#10000' \
+              'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
+              'swift/VideoChatGPT:Generic#2000' \
+    --load_from_cache_file true \
+    --split_dataset_ratio 0.01 \
+    --moe_permute_fusion true \
+    --tensor_model_parallel_size 2 \
+    --expert_model_parallel_size 8 \
+    --moe_grouped_gemm true \
+    --moe_shared_expert_overlap true \
+    --moe_aux_loss_coeff 1e-6 \
+    --micro_batch_size 1 \
+    --global_batch_size 4 \
+    --recompute_granularity full \
+    --recompute_method uniform \
+    --recompute_num_layers 1 \
+    --max_epochs 1 \
+    --finetune true \
+    --cross_entropy_loss_fusion true \
+    --lr 1e-5 \
+    --lr_warmup_fraction 0.05 \
+    --min_lr 1e-6 \
+    --save megatron_output/Qwen3-VL-30B-A3B-Instruct \
+    --eval_interval 500 \
+    --save_interval 500 \
+    --max_length 4096 \
+    --packing true \
+    --num_workers 8 \
+    --dataset_num_proc 8 \
+    --no_save_optim true \
+    --no_save_rng true \
+    --sequence_parallel true \
+    --moe_expert_capacity_factor 1 \
+    --optimizer_cpu_offload true \
+    --use_precision_aware_optimizer true \
+    --optimizer_offload_fraction 0.2 \
+    --attention_backend flash
+```
+
+Megatron格式权重转为HF格式：
+```shell
+CUDA_VISIBLE_DEVICES=0,1 \
+swift export \
+    --mcore_model megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx \
+    --to_hf true \
+    --torch_dtype bfloat16 \
+    --output_dir megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx-hf
+```
+- 若要调整使用对应iter的权重，请修改`megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx`目录下的`latest_checkpointed_iteration.txt`文件。
+
+
+训练结束后，我们使用以下脚本对验证集进行推理：
+```shell
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+swift infer \
+    --model megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx-hf \
+    --stream true \
+    --max_new_tokens 2048 \
+    --load_data_args true
+```
+
+
+使用以下命令将训练权重推送到ModelScope：
+```shell
+swift export \
+    --model output/vx-xxx/checkpoint-xxx \
+    --push_to_hub true \
+    --hub_model_id '<your-model-id>' \
+    --hub_token '<your-sdk-token>'
+```
diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 15dccf0e37..43ee2bd0c3 100644
--- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -751,7 +751,7 @@ qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外，还
 - MAX_NUM: 默认为12。
 - INPUT_SIZE: 默认为448。
 
-### internvl2, internvl2_phi3, internvl2_5, internvl3
+### internvl2, internvl2_phi3, internvl2_5, internvl3, internvl3_5
 参数含义可以查看[这里](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B)。
 - MAX_NUM: 默认为12。
 - INPUT_SIZE: 默认为448。
diff --git a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
new file mode 100644
index 0000000000..71ee1656be
--- /dev/null
+++ b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
@@ -0,0 +1,342 @@
+# Qwen3-VL Best Practices
+## Environment Setup
+Before starting inference and training, please ensure your environment is properly configured.
+
+```shell
+pip install "transformers>=4.57" "qwen_vl_utils>=0.0.14"
+
+pip install "ms-swift>=3.9.1"
+```
+
+## Inference
+Inference using transformers:
+
+```python
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+from modelscope import snapshot_download
+from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+
+model_dir = snapshot_download('Qwen/Qwen3-VL-4B-Instruct')
+model = Qwen3VLForConditionalGeneration.from_pretrained(
+    model_dir, dtype="auto", device_map="auto",
+    # attn_implementation='flash_attention_2',
+)
+
+processor = AutoProcessor.from_pretrained(model_dir)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "video",
+                "video": "https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4",
+                "max_pixels": 128*32*32,
+                "max_frames": 16,
+            },
+            {"type": "text", "text": "Describe this video."},
+        ],
+    }
+]
+
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True,
+                                                                image_patch_size= 16,
+                                                                return_video_metadata=True)
+if video_inputs is not None:
+    video_inputs, video_metadatas = zip(*video_inputs)
+    video_inputs, video_metadatas = list(video_inputs), list(video_metadatas)
+else:
+    video_metadatas = None
+inputs = processor(text=[text], images=image_inputs, videos=video_inputs, video_metadata=video_metadatas, **video_kwargs, do_resize=False, return_tensors="pt")
+inputs = inputs.to('cuda')
+
+generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
+generated_ids_trimmed = [
+    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+]
+output_text = processor.batch_decode(
+    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print(output_text[0])
+# 'A baby wearing glasses sits on a bed, engrossed in reading a book. The baby turns the pages with both hands, occasionally looking up and smiling. The room is cozy, with a crib in the background and clothes scattered around. The baby's focus and curiosity are evident as they explore the book, creating a heartwarming scene of early learning and discovery.'
+```
+
+Inference using ms-swift's PtEngine:
+
+```python
+import os
+# os.environ['SWIFT_DEBUG'] = '1'
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+os.environ['VIDEO_MAX_TOKEN_NUM'] = '128'
+os.environ['FPS_MAX_FRAMES'] = '16'
+
+
+from swift.llm import PtEngine, InferRequest, RequestConfig
+engine = PtEngine('Qwen/Qwen3-VL-4B-Instruct', attn_impl='flash_attention_2')
+infer_request = InferRequest(messages=[{
+    "role": "user",
+    "content": '<video>Describe this video.',
+}], videos=['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4'])
+request_config = RequestConfig(max_tokens=128, temperature=0)
+resp_list = engine.infer([infer_request], request_config=request_config)
+response = resp_list[0].choices[0].message.content
+# 'A baby wearing glasses sits on a bed, engrossed in reading a book. The baby turns the pages with both hands, occasionally looking up and smiling. The room is cozy, with a crib in the background and clothes scattered around. The baby's focus and curiosity are evident as they explore the book, creating a heartwarming scene of early learning and discovery.'
+
+# use stream
+request_config = RequestConfig(max_tokens=128, temperature=0, stream=True)
+gen_list = engine.infer([infer_request], request_config=request_config)
+for chunk in gen_list[0]:
+    if chunk is None:
+        continue
+    print(chunk.choices[0].delta.content, end='', flush=True)
+print()
+```
+
+Inference using command line:
+
+```shell
+CUDA_VISIBLE_DEVICES=0 \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+swift infer \
+    --model Qwen/Qwen3-VL-4B-Instruct \
+    --stream true
+```
+
+```
+<<< who are you?
+Hello! I'm Qwen, a large-scale language model independently developed by the Tongyi Lab under Alibaba Group. My main functions include answering questions, creating text such as stories, official documents, emails, scripts, and more, as well as performing logical reasoning, programming, and other tasks. If you have any questions or need assistance, feel free to let me know anytime, and I'll do my best to help!
+--------------------------------------------------
+<<< <image>describe the image.
+Input an image path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png
+This is a beautifully detailed, close-up portrait of an adorable tabby kitten, rendered with a soft, painterly effect that gives it a gentle, dreamy quality.
+
+Here's a breakdown of the image:
+
+- **The Kitten:** The subject is a young, fluffy kitten with a classic tabby pattern. Its fur is a mix of white and soft grayish-brown stripes, with a prominent dark stripe running down the center of its forehead and over its nose. The kitten's face is predominantly white, with delicate markings around its eyes and cheeks.
+
+- **The Eyes:** Its most captivating feature is its large, round, and expressive eyes. They are a striking shade of bright blue-gray, with dark pupils that give it an intense, curious, and slightly innocent gaze. The eyes are wide open, suggesting the kitten is alert and attentive.
+
+- **The Expression:** The kitten's expression is sweet and innocent. Its small pink nose and slightly parted mouth give it a gentle, almost pleading look. Its whiskers are long and white, standing out against its fur.
+
+- **The Style:** The image has a soft-focus, artistic quality, reminiscent of impressionist painting. The edges of the kitten's fur are slightly blurred, creating a halo effect that draws attention to its face. The background is softly blurred with muted tones of green and gray, which helps the kitten stand out as the clear focal point.
+
+- **Overall Impression:** The image evokes feelings of warmth, cuteness, and tenderness. The kitten appears to be looking directly at the viewer, creating a sense of connection and affection.
+
+This is a lovely and charming depiction of a young kitten, capturing its innocence and charm in a visually appealing and emotionally engaging way.
+--------------------------------------------------
+<<< <video>describe the video.
+Input a video path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4
+This video captures a charming and adorable moment of a young child, likely a toddler, sitting on a bed and pretending to read a book. The child is wearing glasses, which adds a humorous and endearing touch to the scene — as if they're a little scholar or librarian.
+
+Here's a breakdown of what unfolds:
+
+- The child is seated cross-legged on a bed with a patterned quilt. Behind them, a crib and some household items are visible, suggesting a cozy bedroom setting.
+
+- The child holds an open book and appears to be turning the pages with focused attention, mimicking the behavior of a real reader.
+
+- At one point, the child looks up, smiles, or seems to react with delight — perhaps amused by something in the book or just enjoying the activity.
+
+- The child's movements are gentle and deliberate, showing a sense of concentration and curiosity. They turn pages, sometimes with one hand, and occasionally lift the book slightly as if to examine it more closely.
+
+- The video has a warm, candid feel — it's not staged, and the child's natural behavior makes it feel authentic and heartwarming.
+
+Overall, this is a sweet, lighthearted video that showcases the innocence and imagination of early childhood. The child's engagement with the book, combined with their glasses and playful demeanor, creates a delightful and memorable scene.
+```
+
+- For model-specific parameters, such as environment variables like `VIDEO_MAX_TOKEN_NUM`, please refer to the [Command Line Parameters Documentation](../Instruction/Command-line-parameters.md#qwen3_vl).
+
+
+## Training
+This section introduces how to train Qwen3-VL using ms-swift and Megatron-SWIFT. We recommend using ms-swift (i.e., transformers backend, which is more convenient and simple) for Dense models, and Megatron-SWIFT (i.e., megatron backend, which offers faster training speed; see benchmark [here](../Megatron-SWIFT/Quick-start.md#benchmark)) for MoE models.
+
+If you need to fine-tune the model with a custom dataset, you can prepare the data in the following format and set `--dataset train.jsonl --val_dataset val.jsonl` in the command line, where the validation set is optional. For more information, please refer to the [Multimodal Dataset Documentation](../Customization/Custom-dataset.md#multimodal).
+
+```jsonl
+{"messages": [{"role": "user", "content": "Where is the capital of Zhejiang?"}, {"role": "assistant", "content": "The capital of Zhejiang is Hangzhou."}]}
+{"messages": [{"role": "user", "content": "<image><image>What's the difference between these two images?"}, {"role": "assistant", "content": "The first one is a kitten, the second one is a puppy"}], "images": ["/xxx/x.jpg", "/xxx/x.png"]}
+{"messages": [{"role": "system", "content": "You are a helpful and harmless assistant"}, {"role": "user", "content": "<image>What's in the image, <video>what's in the video?"}, {"role": "assistant", "content": "There's an elephant in the image, and a puppy running on the grass in the video"}], "images": ["/xxx/x.jpg"], "videos": ["/xxx/x.mp4"]}
+```
+
+Qwen3-VL's bbox output uses relative coordinates normalized to 1000 (i.e., scaled to the 0–1000 range). You can use the grounding dataset format provided by ms-swift, in which the coordinates in "bbox" are absolute; ms-swift automatically converts them to the normalized relative coordinates. For more information, please refer to the [Grounding Dataset Format Documentation](../Customization/Custom-dataset.md#grounding).
+
+```jsonl
+{"messages": [{"role": "user", "content": "<image>Locate the <ref-object> in the image"}, {"role": "assistant", "content": "[\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n]"}], "images": ["cat.png"], "objects": {"ref": ["sheep", "sheep", "sheep"], "bbox": [[90.9, 160.8, 135, 212.8], [360.9, 480.8, 495, 532.8]]}}
+```
+
+### Dense Models
+Below is a fine-tuning script for the `Qwen3-VL-4B-Instruct` model. We use mixed-modality data as a demo dataset; this example script has no practical value. Training memory usage is 2 * 21GiB, and training time is 12 minutes.
+
+```shell
+# 2 * 21GiB
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+NPROC_PER_NODE=2 \
+CUDA_VISIBLE_DEVICES=0,1 \
+swift sft \
+    --model Qwen/Qwen3-VL-4B-Instruct \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#10000' \
+              'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
+              'swift/VideoChatGPT:Generic#2000' \
+    --load_from_cache_file true \
+    --split_dataset_ratio 0.01 \
+    --train_type lora \
+    --torch_dtype bfloat16 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --attn_impl flash_attn \
+    --padding_free true \
+    --learning_rate 1e-4 \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --freeze_vit true \
+    --freeze_aligner true \
+    --packing true \
+    --gradient_checkpointing true \
+    --vit_gradient_checkpointing false \
+    --gradient_accumulation_steps 2 \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 2 \
+    --logging_steps 5 \
+    --max_length 4096 \
+    --output_dir output \
+    --warmup_ratio 0.05 \
+    --deepspeed zero2 \
+    --dataset_num_proc 4 \
+    --dataloader_num_workers 4
+```
+After training, we use the following script to perform inference on the validation set:
+
+```shell
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+CUDA_VISIBLE_DEVICES=0 \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+swift infer \
+    --adapters output/vx-xxx/checkpoint-xxx \
+    --stream true \
+    --max_new_tokens 2048 \
+    --load_data_args true
+```
+
+```
+--------------------------------------------------
+[QUERY] Using LaTeX to perform OCR on the image.
+[LABELS] 1 + \frac { 1 } { 1 ! } + \frac { 1 } { 2 ! } + \frac { 1 } { 3 ! } + \frac { 1 } { 4 ! }
+[RESPONSE] 1 + \frac { 1 } { 1 ! } + \frac { 1 } { 2 ! } + \frac { 1 } { 3 ! } + \frac { 1 } { 4 ! }
+--------------------------------------------------
+[QUERY] What color suit is the man wearing while playing the saxophone on stage?
+[LABELS] The man is wearing a black suit and white shirt while playing the saxophone on the red-floored stage.
+[RESPONSE] The man is wearing a black suit while playing the saxophone on stage.
+--------------------------------------------------
+...
+```
+
+### MoE Models
+Below is a fine-tuning script for the `Qwen3-VL-30B-A3B-Instruct` model. We use Megatron-SWIFT for single-machine full-parameter training. We still use mixed data for training; this example script has no practical value. Training requires 8 * 80GiB GPU memory, and training time is 20 minutes.
+
+For Megatron-SWIFT environment installation, please refer to the [Megatron-SWIFT Documentation](../Megatron-SWIFT/Quick-start.md). Megatron-SWIFT shares the template and dataset modules with ms-swift, so the custom dataset format and model-specific environment variables introduced earlier still apply.
+
+Convert HF format weights to Megatron format:
+```shell
+CUDA_VISIBLE_DEVICES=0,1 \
+swift export \
+    --model Qwen/Qwen3-VL-30B-A3B-Instruct \
+    --to_mcore true \
+    --torch_dtype bfloat16 \
+    --output_dir Qwen3-VL-30B-A3B-Instruct-mcore
+```
+The fine-tuning script is as follows. For adjusting training techniques and parallelism strategies, refer to the [Megatron-SWIFT Documentation](../Megatron-SWIFT/Quick-start.md#training-tips).
+
+```shell
+# 8 * 80GiB
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+OMP_NUM_THREADS=14 \
+NPROC_PER_NODE=8 \
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+megatron sft \
+    --load Qwen3-VL-30B-A3B-Instruct-mcore \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#10000' \
+              'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
+              'swift/VideoChatGPT:Generic#2000' \
+    --load_from_cache_file true \
+    --split_dataset_ratio 0.01 \
+    --moe_permute_fusion true \
+    --tensor_model_parallel_size 2 \
+    --expert_model_parallel_size 8 \
+    --moe_grouped_gemm true \
+    --moe_shared_expert_overlap true \
+    --moe_aux_loss_coeff 1e-6 \
+    --micro_batch_size 1 \
+    --global_batch_size 4 \
+    --recompute_granularity full \
+    --recompute_method uniform \
+    --recompute_num_layers 1 \
+    --max_epochs 1 \
+    --finetune true \
+    --cross_entropy_loss_fusion true \
+    --lr 1e-5 \
+    --lr_warmup_fraction 0.05 \
+    --min_lr 1e-6 \
+    --save megatron_output/Qwen3-VL-30B-A3B-Instruct \
+    --eval_interval 500 \
+    --save_interval 500 \
+    --max_length 4096 \
+    --packing true \
+    --num_workers 8 \
+    --dataset_num_proc 8 \
+    --no_save_optim true \
+    --no_save_rng true \
+    --sequence_parallel true \
+    --moe_expert_capacity_factor 1 \
+    --optimizer_cpu_offload true \
+    --use_precision_aware_optimizer true \
+    --optimizer_offload_fraction 0.2 \
+    --attention_backend flash
+```
+Convert Megatron format weights to HF format:
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1 \
+swift export \
+    --mcore_model megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx \
+    --to_hf true \
+    --torch_dtype bfloat16 \
+    --output_dir megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx-hf
+```
+- To use weights from a specific iteration, please modify the `latest_checkpointed_iteration.txt` file in the `megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx` directory.
+
+After training, we use the following script to perform inference on the validation set:
+```shell
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+swift infer \
+    --model megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx-hf \
+    --stream true \
+    --max_new_tokens 2048 \
+    --load_data_args true
+```
+Use the following command to push the trained weights to ModelScope:
+
+```shell
+swift export \
+    --model output/vx-xxx/checkpoint-xxx \
+    --push_to_hub true \
+    --hub_model_id '<your-model-id>' \
+    --hub_token '<your-sdk-token>'
+```
\ No newline at end of file
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index d23afb3a99..053660334f 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -773,7 +773,7 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
 - MAX_NUM: Default is 12
 - INPUT_SIZE: Default is 448
 
-### internvl2, internvl2_phi3, internvl2_5, internvl3
+### internvl2, internvl2_phi3, internvl2_5, internvl3, internvl3_5
 For the meaning of the arguments, please refer to [here](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B)
 - MAX_NUM: Default is 12
 - INPUT_SIZE: Default is 448
diff --git a/examples/models/qwen3_vl/mcore_full.sh b/examples/models/qwen3_vl/mcore_full.sh
new file mode 100644
index 0000000000..37c9bbbbec
--- /dev/null
+++ b/examples/models/qwen3_vl/mcore_full.sh
@@ -0,0 +1,47 @@
+# 8 * 80GiB
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+OMP_NUM_THREADS=14 \
+NPROC_PER_NODE=8 \
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+megatron sft \
+    --load Qwen3-VL-30B-A3B-Instruct-mcore \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#10000' \
+              'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
+              'swift/VideoChatGPT:Generic#2000' \
+    --load_from_cache_file true \
+    --split_dataset_ratio 0.01 \
+    --moe_permute_fusion true \
+    --tensor_model_parallel_size 2 \
+    --expert_model_parallel_size 8 \
+    --moe_grouped_gemm true \
+    --moe_shared_expert_overlap true \
+    --moe_aux_loss_coeff 1e-6 \
+    --micro_batch_size 1 \
+    --global_batch_size 4 \
+    --recompute_granularity full \
+    --recompute_method uniform \
+    --recompute_num_layers 1 \
+    --max_epochs 1 \
+    --finetune true \
+    --cross_entropy_loss_fusion true \
+    --lr 1e-5 \
+    --lr_warmup_fraction 0.05 \
+    --min_lr 1e-6 \
+    --save megatron_output/Qwen3-VL-30B-A3B-Instruct \
+    --eval_interval 500 \
+    --save_interval 500 \
+    --max_length 4096 \
+    --packing true \
+    --num_workers 8 \
+    --dataset_num_proc 8 \
+    --no_save_optim true \
+    --no_save_rng true \
+    --sequence_parallel true \
+    --moe_expert_capacity_factor 1 \
+    --optimizer_cpu_offload true \
+    --use_precision_aware_optimizer true \
+    --optimizer_offload_fraction 0.2 \
+    --attention_backend flash
diff --git a/examples/models/qwen3_vl/mixed.sh b/examples/models/qwen3_vl/mixed.sh
new file mode 100644
index 0000000000..86ce216844
--- /dev/null
+++ b/examples/models/qwen3_vl/mixed.sh
@@ -0,0 +1,41 @@
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+NPROC_PER_NODE=2 \
+CUDA_VISIBLE_DEVICES=0,1 \
+swift sft \
+    --model Qwen/Qwen3-VL-4B-Instruct \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#10000' \
+              'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
+              'swift/VideoChatGPT:Generic#2000' \
+    --load_from_cache_file true \
+    --split_dataset_ratio 0.01 \
+    --train_type lora \
+    --torch_dtype bfloat16 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --attn_impl flash_attn \
+    --padding_free true \
+    --learning_rate 1e-4 \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --router_aux_loss_coef 1e-3 \
+    --freeze_vit true \
+    --freeze_aligner true \
+    --packing true \
+    --gradient_checkpointing true \
+    --vit_gradient_checkpointing false \
+    --gradient_accumulation_steps 2 \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 2 \
+    --logging_steps 5 \
+    --max_length 4096 \
+    --output_dir output \
+    --warmup_ratio 0.05 \
+    --deepspeed zero2 \
+    --dataset_num_proc 4 \
+    --dataloader_num_workers 4
diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py
index cdf8900525..f41a8fddbf 100644
--- a/swift/trainers/mixin.py
+++ b/swift/trainers/mixin.py
@@ -761,8 +761,8 @@ def _prepare_gradient_checkpointing(self, model) -> None:
                         else:
                             vision_tower.gradient_checkpointing_disable()
                             vision_tower.disable_input_require_grads()
-                    except (NotImplementedError, AttributeError):
-                        pass
+                    except (NotImplementedError, AttributeError) as e:
+                        logger.warning(f'prepare gradient_checkpointing failed: {e}')
         # Avoid vit_gradient_checkpointing being overwritten by transformers.Trainer.gradient_checkpointing_enable.
         self.args.gradient_checkpointing = False
 

From 977ca6c8393aaf99080280c0384e1f0f491482ac Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Sun, 19 Oct 2025 23:52:05 +0800
Subject: [PATCH 02/11] update

---
 .../Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" | 1 +
 docs/source/index.rst                                            | 1 +
 docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md           | 1 +
 docs/source_en/index.rst                                         | 1 +
 4 files changed, 4 insertions(+)

diff --git "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
index 3705d09e22..6cd0675f07 100644
--- "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
+++ "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
@@ -19,6 +19,7 @@ pip install "ms-swift>=3.9.1"
 import os
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 from modelscope import snapshot_download
+from qwen_vl_utils import process_vision_info
 from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
 
 model_dir = snapshot_download('Qwen/Qwen3-VL-4B-Instruct')
diff --git a/docs/source/index.rst b/docs/source/index.rst
index af6129e628..e568c864d9 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -56,6 +56,7 @@ Swift DOCUMENTATION
    BestPractices/GRPO多模态训练.md
    BestPractices/GRPO代码训练.md
    BestPractices/Qwen3最佳实践.md
+   BestPractices/Qwen3-VL最佳实践.md
    BestPractices/Embedding训练.md
    BestPractices/Reranker训练.md
    BestPractices/快速训练VL模型.md
diff --git a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
index 71ee1656be..e8d2850658 100644
--- a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
+++ b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
@@ -15,6 +15,7 @@ Inference using transformers:
 import os
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 from modelscope import snapshot_download
+from qwen_vl_utils import process_vision_info
 from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
 
 model_dir = snapshot_download('Qwen/Qwen3-VL-4B-Instruct')
diff --git a/docs/source_en/index.rst b/docs/source_en/index.rst
index c561735643..dc11a3384e 100644
--- a/docs/source_en/index.rst
+++ b/docs/source_en/index.rst
@@ -58,6 +58,7 @@ Swift DOCUMENTATION
    BestPractices/GRPO-Multi-Modal-Training.md
    BestPractices/GRPO-Code-Training.md
    BestPractices/Qwen3-Best-Practice.md
+   BestPractices/Qwen3-VL-Best-Practice.md
    BestPractices/Embedding.md
    BestPractices/Reranker.md
    BestPractices/Rapidly-Training-VL-model.md

From 07b1d69e2fc46fdd56dd15d481cb1523c693f57e Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Sun, 19 Oct 2025 23:58:41 +0800
Subject: [PATCH 03/11] update

---
 ...346\234\200\344\275\263\345\256\236\350\267\265.md" | 10 +++++-----
 docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md |  2 +-
 examples/models/qwen3_vl/mixed.sh                      |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
index 6cd0675f07..c38a11a146 100644
--- "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
+++ "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
@@ -3,7 +3,7 @@
 
 ## 环境准备
 
-在开始推理和训练之前，请确保您的环境已准备妥当。
+在开始推理和训练之前，请确保您的环境已准备就绪。
 
 ```shell
 pip install "transformers>=4.57" "qwen_vl_utils>=0.0.14"
@@ -156,7 +156,7 @@ Overall, this is a sweet, lighthearted video that showcases the innocence and im
 
 ## 训练
 
-这里将介绍如何使用ms-swift与Megatron-SWIFT对Qwen3-VL进行训练。推荐Dense模型使用ms-swift（即transformers后端，更加方便简单），而Moe模型使用Megatron-SWIFT（即megatron后端，更快的训练速度，benchmark查看[这里](../Megatron-SWIFT/快速开始.md#benchmark)）。
+本文档将介绍如何使用ms-swift与Megatron-SWIFT训练Qwen3-VL。推荐Dense模型使用ms-swift（即transformers后端，更加方便简单），而Moe模型使用Megatron-SWIFT（即megatron后端，更快的训练速度，benchmark查看[这里](../Megatron-SWIFT/快速开始.md#benchmark)）。
 
 如果您需要自定义数据集微调模型，你可以将数据准备成以下格式，并在命令行中设置`--dataset train.jsonl --val_dataset val.jsonl`，其中验证集为可选。更多介绍请参考[多模态数据集文档](../Customization/自定义数据集.md#多模态)。
 ```jsonl
@@ -172,7 +172,7 @@ Qwen3-VL的bbox输出采用归一化1000的相对坐标。你可以使用ms-swif
 
 ### Dense模型
 
-以下提供对`Qwen3-VL-4B-Instruct`模型的微调脚本，我们使用混合模态数据作为Demo数据集，该示例脚本没有实用价值。训练显存为2 * 21GiB，训练时间为12分钟。
+以下提供对`Qwen3-VL-4B-Instruct`模型的微调脚本，我们使用混合模态数据作为Demo数据集，该示例脚本仅作为演示用途。训练显存为2 * 21GiB，训练时间为12分钟。
 
 ```shell
 # 2 * 21GiB
@@ -248,7 +248,7 @@ swift infer \
 ### Moe模型
 
 
-以下提供对`Qwen3-VL-30B-A3B-Instruct`模型的微调脚本，我们使用Megatron-SWIFT进行单机全参数训练。我们依旧采用混合数据进行训练，该示例脚本没有实用价值。训练所需显存资源为8 * 80GiB，训练时间为20分钟。
+以下提供对`Qwen3-VL-30B-A3B-Instruct`模型的微调脚本，我们使用Megatron-SWIFT进行单机全参数训练。我们同样采用混合数据进行训练，该示例脚本仅作为演示用途。训练所需显存资源为8 * 80GiB，训练时间为20分钟。
 
 关于Megatron-SWIFT的环境安装，请参考[Megatron-SWIFT文档](../Megatron-SWIFT/快速开始.md)。Megatron-SWIFT与ms-swift共用template和dataset模块，因此前面介绍的自定义数据集格式和模型特有环境变量依旧生效。
 
@@ -322,7 +322,7 @@ swift export \
     --torch_dtype bfloat16 \
     --output_dir megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx-hf
 ```
-- 若要调整使用对应iter的权重，请修改`megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx`目录下的`latest_checkpointed_iteration.txt`文件。
+- 若要调整使用对应迭代次数（iter）的权重，请修改`megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx`目录下的`latest_checkpointed_iteration.txt`文件。
 
 
 训练结束后，我们使用以下脚本对验证集进行推理：
diff --git a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
index e8d2850658..77f975687e 100644
--- a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
+++ b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
@@ -340,4 +340,4 @@ swift export \
     --push_to_hub true \
     --hub_model_id '<your-model-id>' \
     --hub_token '<your-sdk-token>'
-```
\ No newline at end of file
+```
diff --git a/examples/models/qwen3_vl/mixed.sh b/examples/models/qwen3_vl/mixed.sh
index 86ce216844..b5e3e55f40 100644
--- a/examples/models/qwen3_vl/mixed.sh
+++ b/examples/models/qwen3_vl/mixed.sh
@@ -1,3 +1,4 @@
+# 2 * 21GiB
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 IMAGE_MAX_TOKEN_NUM=1024 \
 VIDEO_MAX_TOKEN_NUM=128 \
@@ -22,7 +23,6 @@ swift sft \
     --lora_rank 8 \
     --lora_alpha 32 \
     --target_modules all-linear \
-    --router_aux_loss_coef 1e-3 \
     --freeze_vit true \
     --freeze_aligner true \
     --packing true \

From e8bba54bb694f802106594249a53b6a2ac10e9c0 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Sun, 19 Oct 2025 23:59:57 +0800
Subject: [PATCH 04/11] fix

---
 docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
index 77f975687e..c07059b37f 100644
--- a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
+++ b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
@@ -166,7 +166,7 @@ If you need to fine-tune the model with a custom dataset, you can prepare the da
 Qwen3-VL's bbox output uses normalized 1000 relative coordinates. You can use the grounding dataset format provided by ms-swift, where the coordinates in "bbox" are absolute coordinates, and ms-swift will automatically convert absolute coordinates to normalized 1000 relative coordinates. For more information, please refer to the [Grounding Dataset Format Documentation](../Customization/Custom-dataset.md#grounding).
 
 ```jsonl
-{"messages": [{"role": "user", "content": "<image>找到图像中的<ref-object>"}, {"role": "assistant", "content": "[\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n]"}], "images": ["cat.png"], "objects": {"ref": ["羊", "羊", "羊"], "bbox": [[90.9, 160.8, 135, 212.8], [360.9, 480.8, 495, 532.8]]}}
+{"messages": [{"role": "user", "content": "<image>Find <ref-object> in the image."}, {"role": "assistant", "content": "[\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n]"}], "images": ["cat.png"], "objects": {"ref": ["羊", "羊", "羊"], "bbox": [[90.9, 160.8, 135, 212.8], [360.9, 480.8, 495, 532.8]]}}
 ```
 
 ### Dense Models

From 1015b28abd0108468ddefe93604fb7a25bed3473 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Mon, 20 Oct 2025 00:02:58 +0800
Subject: [PATCH 05/11] fix

---
 ...Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" | 2 +-
 docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
index c38a11a146..69143d1401 100644
--- "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
+++ "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
@@ -317,7 +317,7 @@ Megatron格式权重转为Hf格式：
 ```shell
 CUDA_VISIBLE_DEVICES=0,1 \
 swift export \
-    --mcore_adapters megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx \
+    --mcore_model megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx \
     --to_hf true \
     --torch_dtype bfloat16 \
     --output_dir megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx-hf
diff --git a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
index c07059b37f..54aa0a6708 100644
--- a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
+++ b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
@@ -313,7 +313,7 @@ Convert Megatron format weights to HF format:
 ```shell
 CUDA_VISIBLE_DEVICES=0,1 \
 swift export \
-    --mcore_adapters megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx \
+    --mcore_model megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx \
     --to_hf true \
     --torch_dtype bfloat16 \
     --output_dir megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx-hf

From fed18364a5ecb843f8b4ddef94f2f6985b14257f Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Mon, 20 Oct 2025 00:10:40 +0800
Subject: [PATCH 06/11] update

---
 ...00\344\275\263\345\256\236\350\267\265.md" | 19 ++++++++++---------
 .../BestPractices/Qwen3-VL-Best-Practice.md   |  5 +++--
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
index 69143d1401..a8473a1bf5 100644
--- "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
+++ "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
@@ -9,12 +9,13 @@
 pip install "transformers>=4.57" "qwen_vl_utils>=0.0.14"
 
 pip install "ms-swift>=3.9.1"
+# pip install vllm>="0.11.0"  # 若使用vllm推理后端进行推理
 ```
 
 
 ## 推理
 
-使用transformers推理：
+使用 transformers 推理：
 ```python
 import os
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
@@ -69,7 +70,7 @@ print(output_text[0])
 # 'A baby wearing glasses sits on a bed, engrossed in reading a book. The baby turns the pages with both hands, occasionally looking up and smiling. The room is cozy, with a crib in the background and clothes scattered around. The baby’s focus and curiosity are evident as they explore the book, creating a heartwarming scene of early learning and discovery.'
 ```
 
-使用ms-swift的PtEngine进行推理：
+使用 ms-swift 的 `PtEngine` 进行推理：
 ```python
 import os
 os.environ['SWIFT_DEBUG'] = '1'
@@ -101,7 +102,7 @@ print()
 
 使用命令行推理：
 ```shell
-CUDA_VISIBLE_DEVICES=4 \
+CUDA_VISIBLE_DEVICES=0 \
 IMAGE_MAX_TOKEN_NUM=1024 \
 VIDEO_MAX_TOKEN_NUM=128 \
 FPS_MAX_FRAMES=16 \
@@ -151,12 +152,12 @@ Here’s a breakdown of what unfolds:
 Overall, this is a sweet, lighthearted video that showcases the innocence and imagination of early childhood. The child’s engagement with the book, combined with their glasses and playful demeanor, creates a delightful and memorable scene.
 ```
 
-- 其中特定模型参数，例如`VIDEO_MAX_TOKEN_NUM`等环境变量的含义参考[命令行参数文档](../Instruction/命令行参数.md#qwen3_vl)。
+- 其中特定模型参数，例如 `VIDEO_MAX_TOKEN_NUM` 等环境变量的含义参考[命令行参数文档](../Instruction/命令行参数.md#qwen3_vl)。
 
 
 ## 训练
 
-本文档将介绍如何使用ms-swift与Megatron-SWIFT训练Qwen3-VL。推荐Dense模型使用ms-swift（即transformers后端，更加方便简单），而Moe模型使用Megatron-SWIFT（即megatron后端，更快的训练速度，benchmark查看[这里](../Megatron-SWIFT/快速开始.md#benchmark)）。
+本文档将介绍如何使用 ms-swift 与 Megatron-SWIFT 训练 Qwen3-VL。推荐 Dense 模型使用 ms-swift（即 transformers 后端，更加方便简单），而 Moe 模型使用 Megatron-SWIFT（即 megatron 后端，更快的训练速度，benchmark查看[这里](../Megatron-SWIFT/快速开始.md#benchmark)）。
 
 如果您需要自定义数据集微调模型，你可以将数据准备成以下格式，并在命令行中设置`--dataset train.jsonl --val_dataset val.jsonl`，其中验证集为可选。更多介绍请参考[多模态数据集文档](../Customization/自定义数据集.md#多模态)。
 ```jsonl
@@ -165,7 +166,7 @@ Overall, this is a sweet, lighthearted video that showcases the innocence and im
 {"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "<image>图片中是什么，<video>视频中是什么"}, {"role": "assistant", "content": "图片中是一个大象，视频中是一只小狗在草地上奔跑"}], "images": ["/xxx/x.jpg"], "videos": ["/xxx/x.mp4"]}
 ```
 
-Qwen3-VL的bbox输出采用归一化1000的相对坐标。你可以使用ms-swift提供的grounding数据集格式，其中"bbox"中的坐标为绝对坐标，ms-swift会自动将绝对坐标转为归一化1000的相对坐标。更多信息请参考[grounding数据集格式文档](../Customization/自定义数据集.md#grounding)。
+Qwen3-VL的bbox输出采用归一化1000的相对坐标。你可以使用 ms-swift 提供的 grounding 数据集格式，其中"bbox"中的坐标为绝对坐标，ms-swift 会自动将绝对坐标转为归一化1000的相对坐标。更多信息请参考[grounding数据集格式文档](../Customization/自定义数据集.md#grounding)。
 ```jsonl
 {"messages": [{"role": "user", "content": "<image>找到图像中的<ref-object>"}, {"role": "assistant", "content": "[\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n]"}], "images": ["cat.png"], "objects": {"ref": ["羊", "羊", "羊"], "bbox": [[90.9, 160.8, 135, 212.8], [360.9, 480.8, 495, 532.8]]}}
 ```
@@ -248,9 +249,9 @@ swift infer \
 ### Moe模型
 
 
-以下提供对`Qwen3-VL-30B-A3B-Instruct`模型的微调脚本，我们使用Megatron-SWIFT进行单机全参数训练。我们同样采用混合数据进行训练，该示例脚本仅作为演示用途。训练所需显存资源为8 * 80GiB，训练时间为20分钟。
+以下提供对`Qwen3-VL-30B-A3B-Instruct`模型的微调脚本，我们使用 Megatron-SWIFT 进行单机全参数训练。我们同样采用混合数据进行训练，该示例脚本仅作为演示用途。训练所需显存资源为8 * 80GiB，训练时间为20分钟。
 
-关于Megatron-SWIFT的环境安装，请参考[Megatron-SWIFT文档](../Megatron-SWIFT/快速开始.md)。Megatron-SWIFT与ms-swift共用template和dataset模块，因此前面介绍的自定义数据集格式和模型特有环境变量依旧生效。
+关于 Megatron-SWIFT 的环境安装，请参考[Megatron-SWIFT文档](../Megatron-SWIFT/快速开始.md)。Megatron-SWIFT 与 ms-swift 共用 template 和 dataset 模块，因此前面介绍的自定义数据集格式和模型特有环境变量依旧生效。
 
 HF格式权重转为Megatron格式：
 ```shell
@@ -339,7 +340,7 @@ swift infer \
 ```
 
 
-使用以下命令将训练权重推送到Modelscope：
+使用以下命令将训练权重推送到 ModelScope：
 ```shell
 swift export \
     --model output/vx-xxx/checkpoint-xxx \
diff --git a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
index 54aa0a6708..f140626748 100644
--- a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
+++ b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
@@ -6,6 +6,7 @@ Before starting inference and training, please ensure your environment is proper
 pip install "transformers>=4.57" "qwen_vl_utils>=0.0.14"
 
 pip install "ms-swift>=3.9.1"
+# pip install vllm>="0.11.0"  # If using the vLLM inference backend for inference
 ```
 
 ## Inference
@@ -99,7 +100,7 @@ print()
 Inference using command line:
 
 ```shell
-CUDA_VISIBLE_DEVICES=4 \
+CUDA_VISIBLE_DEVICES=0 \
 IMAGE_MAX_TOKEN_NUM=1024 \
 VIDEO_MAX_TOKEN_NUM=128 \
 FPS_MAX_FRAMES=16 \
@@ -166,7 +167,7 @@ If you need to fine-tune the model with a custom dataset, you can prepare the da
 Qwen3-VL's bbox output uses normalized 1000 relative coordinates. You can use the grounding dataset format provided by ms-swift, where the coordinates in "bbox" are absolute coordinates, and ms-swift will automatically convert absolute coordinates to normalized 1000 relative coordinates. For more information, please refer to the [Grounding Dataset Format Documentation](../Customization/Custom-dataset.md#grounding).
 
 ```jsonl
-{"messages": [{"role": "user", "content": "<image>Find <ref-object> in the image."}, {"role": "assistant", "content": "[\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n]"}], "images": ["cat.png"], "objects": {"ref": ["羊", "羊", "羊"], "bbox": [[90.9, 160.8, 135, 212.8], [360.9, 480.8, 495, 532.8]]}}
+{"messages": [{"role": "user", "content": "<image>Locate the <ref-object> in the image"}, {"role": "assistant", "content": "[\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n\t{\"bbox_2d\": <bbox>, \"label\": \"<ref-object>\"}\n]"}], "images": ["cat.png"], "objects": {"ref": ["sheep", "sheep", "sheep"], "bbox": [[90.9, 160.8, 135, 212.8], [360.9, 480.8, 495, 532.8]]}}
 ```
 
 ### Dense Models

From 6ccab6824770ec3afb579c8255484a05bc2efe51 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Mon, 20 Oct 2025 00:14:01 +0800
Subject: [PATCH 07/11] update

---
 ...wen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" | 3 ++-
 docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md         | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
index a8473a1bf5..556b6c50b5 100644
--- "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
+++ "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
@@ -329,11 +329,12 @@ swift export \
 训练结束后，我们使用以下脚本对验证集进行推理：
 ```shell
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+CUDA_VISIBLE_DEVICES=0 \
 IMAGE_MAX_TOKEN_NUM=1024 \
 VIDEO_MAX_TOKEN_NUM=128 \
 FPS_MAX_FRAMES=16 \
 swift infer \
-    --model megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx-hf
+    --model megatron_output/Qwen3-VL-30B-A3B-Instruct/vx-xxx-hf \
     --stream true \
     --max_new_tokens 2048 \
     --load_data_args true
diff --git a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
index f140626748..fc17b3b4ba 100644
--- a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
+++ b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
@@ -324,6 +324,7 @@ After training, we use the following script to perform inference on the validati
 
 ```shell
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+CUDA_VISIBLE_DEVICES=0 \
 IMAGE_MAX_TOKEN_NUM=1024 \
 VIDEO_MAX_TOKEN_NUM=128 \
 FPS_MAX_FRAMES=16 \

From 6683764cbe54f56b3c9134a5c71818ba54705981 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Mon, 20 Oct 2025 00:22:57 +0800
Subject: [PATCH 08/11] update

---
 ...Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" | 2 +-
 docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
index 556b6c50b5..ba99f79373 100644
--- "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
+++ "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
@@ -307,7 +307,7 @@ megatron sft \
     --no_save_optim true \
     --no_save_rng true \
     --sequence_parallel true \
-    --moe_expert_capacity_factor 1 \
+    --moe_expert_capacity_factor 2 \
     --optimizer_cpu_offload true \
     --use_precision_aware_optimizer true \
     --optimizer_offload_fraction 0.2 \
diff --git a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
index fc17b3b4ba..292cea93a8 100644
--- a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
+++ b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
@@ -303,7 +303,7 @@ megatron sft \
     --no_save_optim true \
     --no_save_rng true \
     --sequence_parallel true \
-    --moe_expert_capacity_factor 1 \
+    --moe_expert_capacity_factor 2 \
     --optimizer_cpu_offload true \
     --use_precision_aware_optimizer true \
     --optimizer_offload_fraction 0.2 \

From 92876f8801071951e7091400fc9e25429c9c7429 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Mon, 20 Oct 2025 00:23:07 +0800
Subject: [PATCH 09/11] update

---
 examples/models/qwen3_vl/mcore_full.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/models/qwen3_vl/mcore_full.sh b/examples/models/qwen3_vl/mcore_full.sh
index 37c9bbbbec..f1877be83a 100644
--- a/examples/models/qwen3_vl/mcore_full.sh
+++ b/examples/models/qwen3_vl/mcore_full.sh
@@ -40,7 +40,7 @@ megatron sft \
     --no_save_optim true \
     --no_save_rng true \
     --sequence_parallel true \
-    --moe_expert_capacity_factor 1 \
+    --moe_expert_capacity_factor 2 \
     --optimizer_cpu_offload true \
     --use_precision_aware_optimizer true \
     --optimizer_offload_fraction 0.2 \

From e7c026e7e3c3e88bd4a717566a1fa718c211274e Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Mon, 20 Oct 2025 00:25:35 +0800
Subject: [PATCH 10/11] update

---
 ...\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" | 2 +-
 docs/source_en/Megatron-SWIFT/Command-line-parameters.md        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git "a/docs/source/Megatron-SWIFT/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Megatron-SWIFT/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 639a3f125c..6ab22283d1 100644
--- "a/docs/source/Megatron-SWIFT/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/Megatron-SWIFT/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -196,7 +196,7 @@
   - 注意：在"ms-swift<3.7.1"，其默认为None，自动从config.json读取。
 - moe_z_loss_coeff: z-loss 的缩放系数。默认为None。
 - 🔥moe_shared_expert_overlap: 启用共享专家计算与调度器通信之间的重叠。如果不启用此选项，共享专家将在路由专家之后执行。仅在设置了`moe_shared_expert_intermediate_size`时有效。默认为False。
-- 🔥moe_expert_capacity_factor: 每个专家的容量因子，None表示不会丢弃任何token。默认为None。通过设置 `--moe_expert_capacity_factor`，超出专家容量的 token 会基于其被选中的概率被丢弃。可以**令训练负载均匀，提升训练速度**（例如设置为1）。
+- 🔥moe_expert_capacity_factor: 每个专家的容量因子，None表示不会丢弃任何token。默认为None。通过设置 `--moe_expert_capacity_factor`，超出专家容量的 token 会基于其被选中的概率被丢弃。可以**令训练负载均匀，提升训练速度**（例如设置为1或2）。
 - moe_pad_expert_input_to_capacity: 对每个专家（expert）的输入进行填充，使其长度与专家容量（expert capacity length）对齐，默认为False。该操作仅在设置了 `--moe_expert_capacity_factor` 参数后才生效。
 - moe_token_drop_policy: 可选为'probs', 'position'。默认为'probs'。
 
diff --git a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md
index a40a3e6ea2..163990589c 100644
--- a/docs/source_en/Megatron-SWIFT/Command-line-parameters.md
+++ b/docs/source_en/Megatron-SWIFT/Command-line-parameters.md
@@ -207,7 +207,7 @@ For guidance on selecting parallelization strategies, please refer to the [Train
   - Note: In ms-swift versions earlier than 3.7.1, the default is None and the value is automatically loaded from config.json.
 - moe_z_loss_coeff: Scaling coefficient for z-loss. Default is None.
 - 🔥moe_shared_expert_overlap: Enables overlap between shared expert computation and the dispatcher. If not enabled, shared expert computation will be performed after routing experts. Only effective when `moe_shared_expert_intermediate_size` is set. Default is False.
-- 🔥moe_expert_capacity_factor: Capacity factor for each expert. `None` means no tokens will be dropped. Default is `None`. When `--moe_expert_capacity_factor` is set, tokens exceeding an expert’s capacity will be dropped based on their selection probability. This can **balance the training load and improve training speed** (for example, set it to 1.).
+- 🔥moe_expert_capacity_factor: Capacity factor for each expert. `None` means no tokens will be dropped. Default is `None`. When `--moe_expert_capacity_factor` is set, tokens exceeding an expert’s capacity will be dropped based on their selection probability. This can **balance the training load and improve training speed** (for example, set it to 1 or 2).
 - moe_pad_expert_input_to_capacity: Pad the input of each expert so that its length aligns with the expert capacity length. Default is `False`. This option only takes effect if `--moe_expert_capacity_factor` is set.
 - moe_token_drop_policy: Options are 'probs' and 'position'. Default is 'probs'.
 

From ba75656490fd2aa5bbbbeb3f1e3865112ed3a3d1 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Mon, 20 Oct 2025 00:28:43 +0800
Subject: [PATCH 11/11] fix

---
 ...Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" | 2 +-
 docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
index ba99f79373..9a0528c890 100644
--- "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
+++ "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md"
@@ -9,7 +9,7 @@
 pip install "transformers>=4.57" "qwen_vl_utils>=0.0.14"
 
 pip install "ms-swift>=3.9.1"
-# pip install vllm>="0.11.0"  # 若使用vllm推理后端进行推理
+# pip install "vllm>=0.11.0"  # 若使用vllm推理后端进行推理
 ```
 
 
diff --git a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
index 292cea93a8..78308c0897 100644
--- a/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
+++ b/docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md
@@ -6,7 +6,7 @@ Before starting inference and training, please ensure your environment is proper
 pip install "transformers>=4.57" "qwen_vl_utils>=0.0.14"
 
 pip install "ms-swift>=3.9.1"
-# pip install vllm>="0.11.0"  # If using the vLLM inference backend for inference
+# pip install "vllm>=0.11.0"  # If using the vLLM inference backend for inference
 ```
 
 ## Inference