From 646e87e519b416658901b2b4e4c3d6120fba63e3 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 19 Oct 2025 23:50:19 +0800 Subject: [PATCH 01/11] update qwen3_vl docs --- ...00\344\275\263\345\256\236\350\267\265.md" | 349 ++++++++++++++++++ ...44\350\241\214\345\217\202\346\225\260.md" | 2 +- .../BestPractices/Qwen3-VL-Best-Practice.md | 342 +++++++++++++++++ .../Instruction/Command-line-parameters.md | 2 +- examples/models/qwen3_vl/mcore_full.sh | 47 +++ examples/models/qwen3_vl/mixed.sh | 41 +++ swift/trainers/mixin.py | 4 +- 7 files changed, 783 insertions(+), 4 deletions(-) create mode 100644 "docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" create mode 100644 docs/source_en/BestPractices/Qwen3-VL-Best-Practice.md create mode 100644 examples/models/qwen3_vl/mcore_full.sh create mode 100644 examples/models/qwen3_vl/mixed.sh diff --git "a/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" new file mode 100644 index 0000000000..3705d09e22 --- /dev/null +++ "b/docs/source/BestPractices/Qwen3-VL\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -0,0 +1,349 @@ + +# Qwen3-VL最佳实践 + +## 环境准备 + +在开始推理和训练之前,请确保您的环境已准备妥当。 + +```shell +pip install "transformers>=4.57" "qwen_vl_utils>=0.0.14" + +pip install "ms-swift>=3.9.1" +``` + + +## 推理 + +使用transformers推理: +```python +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' +from modelscope import snapshot_download +from qwen_vl_utils import process_vision_info +from transformers import Qwen3VLForConditionalGeneration, AutoProcessor + +model_dir = snapshot_download('Qwen/Qwen3-VL-4B-Instruct') + +model = Qwen3VLForConditionalGeneration.from_pretrained( + model_dir, dtype="auto", device_map="auto", + # attn_implementation='flash_attention_2', +) + +processor = AutoProcessor.from_pretrained(model_dir) + +messages = [ + { + "role": "user", + "content": [ + { + "type": "video", + "video":
"https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4", + "max_pixels": 128*32*32, + "max_frames": 16, + }, + {"type": "text", "text": "Describe this video."}, + ], + } +] + +text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True, + image_patch_size= 16, + return_video_metadata=True) +if video_inputs is not None: + video_inputs, video_metadatas = zip(*video_inputs) + video_inputs, video_metadatas = list(video_inputs), list(video_metadatas) +else: + video_metadatas = None +inputs = processor(text=[text], images=image_inputs, videos=video_inputs, video_metadata=video_metadatas, **video_kwargs, do_resize=False, return_tensors="pt") +inputs = inputs.to('cuda') + +generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False) +generated_ids_trimmed = [ + out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) +] +output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False +) +print(output_text[0]) +# 'A baby wearing glasses sits on a bed, engrossed in reading a book. The baby turns the pages with both hands, occasionally looking up and smiling. The room is cozy, with a crib in the background and clothes scattered around. The baby’s focus and curiosity are evident as they explore the book, creating a heartwarming scene of early learning and discovery.' +``` + +使用ms-swift的PtEngine进行推理: +```python +import os +os.environ['SWIFT_DEBUG'] = '1' +os.environ['CUDA_VISIBLE_DEVICES'] = '0' +os.environ['VIDEO_MAX_TOKEN_NUM'] = '128' +os.environ['FPS_MAX_FRAMES'] = '16' + + +from swift.llm import PtEngine, InferRequest, RequestConfig +engine = PtEngine('Qwen/Qwen3-VL-4B-Instruct', attn_impl='flash_attention_2') +infer_request = InferRequest(messages=[{ + "role": "user", + "content": '