From 3f2faa9094de860e9a86ca2d0ac99694d500dd3b Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 26 Sep 2024 15:57:12 +0800 Subject: [PATCH 1/2] fix --- swift/llm/sft.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/swift/llm/sft.py b/swift/llm/sft.py index c9bea3528d..113fd365d0 100644 --- a/swift/llm/sft.py +++ b/swift/llm/sft.py @@ -270,7 +270,10 @@ def prepare_model_template_train(args, msg: Optional[Dict[str, Any]] = None): for vision_tower_name in mllm_arch.vision_tower: vision_tower = deep_getattr(model, vision_tower_name) if hasattr(vision_tower, 'enable_input_require_grads'): - vision_tower.enable_input_require_grads() + try: + vision_tower.enable_input_require_grads() + except NotImplementedError: + pass if use_torchacc(): model.config.use_cache = False From 764a70bcf300771dc243a06b891df0130a62f626 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 26 Sep 2024 16:38:29 +0800 Subject: [PATCH 2/2] update --- ...14\346\225\260\346\215\256\351\233\206.md" | 30 +++++++++---------- .../Instruction/Supported-models-datasets.md | 30 +++++++++---------- swift/llm/utils/model.py | 8 ++--- 3 files changed, 34 insertions(+), 34 deletions(-) diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index eabde47c86..7bdfac1532 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -426,21 +426,21 @@ |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| |qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| -|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| -|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| -|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| -|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| -|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| -|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| -|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| -|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| -|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| -|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| -|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| -|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| -|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| -|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| -|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| +|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| +|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| +|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| +|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| +|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| +|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| +|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| +|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| +|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| +|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| +|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| +|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| +|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| +|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| +|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|✘|✘|✘|✘|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| |llama3_2-11b-visiont|[LLM-Research/Llama-3.2-11B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)| |llama3_2-11b-vision-instruct|[LLM-Research/Llama-3.2-11B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)| diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md index afe28bca02..5795b99b34 100644 --- a/docs/source_en/Instruction/Supported-models-datasets.md +++ b/docs/source_en/Instruction/Supported-models-datasets.md @@ -426,21 +426,21 @@ The table below introcudes all models supported by SWIFT: |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| |qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| -|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| -|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| -|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| -|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| -|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| -|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| -|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| -|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| -|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| -|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| -|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| -|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| -|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| -|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| -|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| +|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| +|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| +|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| +|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| +|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| +|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| +|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| +|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| +|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| +|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| +|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| +|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| +|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| +|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| +|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.dev.0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|✘|✘|✘|✘|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| |llama3_2-11b-visiont|[LLM-Research/Llama-3.2-11B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)| |llama3_2-11b-vision-instruct|[LLM-Research/Llama-3.2-11B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)| diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 3799aee022..c0d42d6552 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -3681,7 +3681,7 @@ def _read_from_stream(container: 'av.container.Container', start_offset: float, support_flash_attn=True, support_vllm=True, placeholder_tokens=['<|image_pad|>', '<|video_pad|>'], - requires=['transformers>=4.45', 'qwen_vl_utils'], + requires=['transformers>=4.45.dev.0', 'qwen_vl_utils'], tags=['multi-modal', 'vision', 'video'], hf_model_id=f'Qwen/Qwen2-VL-{model_size}') register_model( @@ -3693,7 +3693,7 @@ def _read_from_stream(container: 'av.container.Container', start_offset: float, support_flash_attn=True, support_vllm=True, placeholder_tokens=['<|image_pad|>', '<|video_pad|>'], - requires=['transformers>=4.45', 'qwen_vl_utils'], # 'pyav' + requires=['transformers>=4.45.dev.0', 'qwen_vl_utils'], # 'pyav' tags=['multi-modal', 'vision', 'video'], hf_model_id=f'Qwen/Qwen2-VL-{model_size}-Instruct') for quant_bits in [4, 8]: @@ -3708,7 +3708,7 @@ def _read_from_stream(container: 'av.container.Container', start_offset: float, support_flash_attn=True, support_vllm=True, placeholder_tokens=['<|image_pad|>', '<|video_pad|>'], - requires=['transformers>=4.45', 'qwen_vl_utils', 'auto_gptq>=0.5'], + requires=['transformers>=4.45.dev.0', 'qwen_vl_utils', 'auto_gptq>=0.5'], tags=['multi-modal', 'vision', 'video'], function_kwargs={'gptq_bits': quant_bits}, torch_dtype=torch.float16, @@ -3723,7 +3723,7 @@ def _read_from_stream(container: 'av.container.Container', start_offset: float, support_flash_attn=True, support_vllm=True, placeholder_tokens=['<|image_pad|>', '<|video_pad|>'], - requires=['transformers>=4.45', 'qwen_vl_utils', 'autoawq'], + requires=['transformers>=4.45.dev.0', 'qwen_vl_utils', 'autoawq'], tags=['multi-modal', 'vision', 'video'], function_kwargs={'is_awq': True}, torch_dtype=torch.float16,