From 618eaaafb5afc4112e822a524cfe732229850c5e Mon Sep 17 00:00:00 2001 From: DaozeZhang Date: Tue, 10 Sep 2024 16:59:10 +0800 Subject: [PATCH 1/7] add transformers in gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 9b56a91519..493618bced 100644 --- a/.gitignore +++ b/.gitignore @@ -130,6 +130,7 @@ output/ *.out benchmarks/ eval_outputs/ +transformers/ # Pytorch *.pth From 8fd40515386ebed250d400c3a58c6e1c9c47b05a Mon Sep 17 00:00:00 2001 From: DaozeZhang Date: Wed, 11 Sep 2024 18:57:16 +0800 Subject: [PATCH 2/7] fix a typo bug in text-caps --- swift/llm/utils/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index fc27c35eb6..75e6784516 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -1430,7 +1430,7 @@ def preprocess(row): 'swift/TextCaps', [], preprocess_func=preprocess_text_caps, get_function=get_dataset_from_repo, - split=['train', 'val'], + split=['train', 'validation'], hf_dataset_id='HuggingFaceM4/TextCaps', huge_dataset=True, tags=['multi-modal', 'en', 'caption', 'quality']) From 66c040a1d42aaab995c5472898e2a9d1d1fb85b7 Mon Sep 17 00:00:00 2001 From: DaozeZhang Date: Fri, 20 Sep 2024 10:00:43 +0800 Subject: [PATCH 3/7] support mistral-small-2409 --- swift/llm/utils/model.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 35d5350c23..eeb026baeb 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -431,6 +431,7 @@ class ModelType: mistral_nemo_base_2407 = 'mistral-nemo-base-2407' mistral_nemo_instruct_2407 = 'mistral-nemo-instruct-2407' mistral_large_instruct_2407 = 'mistral-large-instruct-2407' + mistral_small_instruct_2409 = 'mistral-small-instruct-2409' mixtral_moe_7b = 'mixtral-moe-7b' mixtral_moe_7b_instruct = 'mixtral-moe-7b-instruct' mixtral_moe_7b_aqlm_2bit_1x16 = 'mixtral-moe-7b-aqlm-2bit-1x16' # aqlm @@ -2570,6 +2571,16 @@ def get_model_tokenizer_glm4v(model_dir: str, support_flash_attn=True, support_vllm=True, hf_model_id='mistralai/Mistral-Large-Instruct-2407') +@register_model( + ModelType.mistral_small_instruct_2409, + 'AI-ModelScope/Mistral-Small-Instruct-2409', + LoRATM.llama, + TemplateType.mistral_nemo, + requires=['transformers>=4.43'], + ignore_file_pattern=['^consolidated'], + support_flash_attn=True, + support_vllm=True, + hf_model_id='mistralai/Mistral-Small-Instruct-2409') @register_model( ModelType.mistral_nemo_instruct_2407, 'AI-ModelScope/Mistral-Nemo-Instruct-2407', From afb38021dce236c2d1657189b951254501b62ab5 Mon Sep 17 00:00:00 2001 From: DaozeZhang Date: Fri, 20 Sep 2024 10:04:42 +0800 Subject: [PATCH 4/7] add .run to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 493618bced..d6764baebd 100644 --- a/.gitignore +++ b/.gitignore @@ -109,6 +109,7 @@ venv.bak/ .vscode .idea +.run # custom *.pkl From de8adddc7bb6e3cc7f082c1435a31c0d26190c7f Mon Sep 17 00:00:00 2001 From: DaozeZhang Date: Fri, 20 Sep 2024 10:10:25 +0800 Subject: [PATCH 5/7] add the doc of mistral-small-2409 --- ...14\346\225\260\346\215\256\351\233\206.md" | 86 ++++--------------- .../Instruction/Supported-models-datasets.md | 86 ++++--------------- 2 files changed, 34 insertions(+), 138 deletions(-) diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index a374d272a4..bfbae085da 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -109,51 +109,6 @@ |qwen2-math-7b-instruct|[qwen/Qwen2-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Math-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✔|transformers>=4.37|-|[Qwen/Qwen2-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Math-7B-Instruct)| |qwen2-math-72b|[qwen/Qwen2-Math-72B](https://modelscope.cn/models/qwen/Qwen2-Math-72B/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✔|transformers>=4.37|-|[Qwen/Qwen2-Math-72B](https://huggingface.co/Qwen/Qwen2-Math-72B)| |qwen2-math-72b-instruct|[qwen/Qwen2-Math-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Math-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✔|transformers>=4.37|-|[Qwen/Qwen2-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2-Math-72B-Instruct)| -|qwen2_5-0_5b|[qwen/Qwen2.5-0.5B](https://modelscope.cn/models/qwen/Qwen2.5-0.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B)| -|qwen2_5-1_5b|[qwen/Qwen2.5-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-1.5B](https://huggingface.co/Qwen/Qwen2.5-1.5B)| -|qwen2_5-3b|[qwen/Qwen2.5-3B](https://modelscope.cn/models/qwen/Qwen2.5-3B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-3B](https://huggingface.co/Qwen/Qwen2.5-3B)| -|qwen2_5-7b|[qwen/Qwen2.5-7B](https://modelscope.cn/models/qwen/Qwen2.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B)| -|qwen2_5-14b|[qwen/Qwen2.5-14B](https://modelscope.cn/models/qwen/Qwen2.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B)| -|qwen2_5-32b|[qwen/Qwen2.5-32B](https://modelscope.cn/models/qwen/Qwen2.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B)| -|qwen2_5-72b|[qwen/Qwen2.5-72B](https://modelscope.cn/models/qwen/Qwen2.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B](https://huggingface.co/Qwen/Qwen2.5-72B)| -|qwen2_5-0_5b-instruct|[qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)| -|qwen2_5-1_5b-instruct|[qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)| -|qwen2_5-3b-instruct|[qwen/Qwen2.5-3B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)| -|qwen2_5-7b-instruct|[qwen/Qwen2.5-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)| -|qwen2_5-14b-instruct|[qwen/Qwen2.5-14B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)| -|qwen2_5-32b-instruct|[qwen/Qwen2.5-32B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)| -|qwen2_5-72b-instruct|[qwen/Qwen2.5-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct)| -|qwen2_5-0_5b-instruct-gptq-int4|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4)| -|qwen2_5-1_5b-instruct-gptq-int4|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4)| -|qwen2_5-3b-instruct-gptq-int4|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4)| -|qwen2_5-7b-instruct-gptq-int4|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4)| -|qwen2_5-14b-instruct-gptq-int4|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4)| -|qwen2_5-32b-instruct-gptq-int4|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4)| -|qwen2_5-72b-instruct-gptq-int4|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4)| -|qwen2_5-0_5b-instruct-gptq-int8|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8)| -|qwen2_5-1_5b-instruct-gptq-int8|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8)| -|qwen2_5-3b-instruct-gptq-int8|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8)| -|qwen2_5-7b-instruct-gptq-int8|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8)| -|qwen2_5-14b-instruct-gptq-int8|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8)| -|qwen2_5-32b-instruct-gptq-int8|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8)| -|qwen2_5-72b-instruct-gptq-int8|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8)| -|qwen2_5-0_5b-instruct-awq|[qwen/Qwen2.5-0.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-0.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-AWQ)| -|qwen2_5-1_5b-instruct-awq|[qwen/Qwen2.5-1.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-1.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-AWQ)| -|qwen2_5-3b-instruct-awq|[qwen/Qwen2.5-3B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-AWQ)| -|qwen2_5-7b-instruct-awq|[qwen/Qwen2.5-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-AWQ)| -|qwen2_5-14b-instruct-awq|[qwen/Qwen2.5-14B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-14B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-AWQ)| -|qwen2_5-32b-instruct-awq|[qwen/Qwen2.5-32B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-32B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-AWQ)| -|qwen2_5-72b-instruct-awq|[qwen/Qwen2.5-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-AWQ)| -|qwen2_5-math-1_5b|[qwen/Qwen2.5-Math-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B)| -|qwen2_5-math-7b|[qwen/Qwen2.5-Math-7B](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B)| -|qwen2_5-math-72b|[qwen/Qwen2.5-Math-72B](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B](https://huggingface.co/Qwen/Qwen2.5-Math-72B)| -|qwen2_5-math-1_5b-instruct|[qwen/Qwen2.5-Math-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct)| -|qwen2_5-math-7b-instruct|[qwen/Qwen2.5-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct)| -|qwen2_5-math-72b-instruct|[qwen/Qwen2.5-Math-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct)| -|qwen2_5-coder-1_5b|[qwen/Qwen2.5-Coder-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B)| -|qwen2_5-coder-1_5b-instruct|[qwen/Qwen2.5-Coder-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)| -|qwen2_5-coder-7b|[qwen/Qwen2.5-Coder-7B](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B](https://huggingface.co/Qwen/Qwen2.5-Coder-7B)| -|qwen2_5-coder-7b-instruct|[qwen/Qwen2.5-Coder-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct)| |chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)| |chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm2-6b-32k](https://huggingface.co/THUDM/chatglm2-6b-32k)| |chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm3-6b-base](https://huggingface.co/THUDM/chatglm3-6b-base)| @@ -391,7 +346,7 @@ |phi3-small-128k-instruct|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct/summary)|query_key_value|phi3|✔|✔|✘|✘|transformers>=4.36|-|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)| |phi3-medium-128k-instruct|[LLM-Research/Phi-3-medium-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-medium-128k-instruct/summary)|qkv_proj|phi3|✔|✔|✘|✘|transformers>=4.36|-|[microsoft/Phi-3-medium-128k-instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)| |phi3_5-mini-instruct|[LLM-Research/Phi-3.5-mini-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-mini-instruct/summary)|qkv_proj|phi3|✔|✔|✘|✘|transformers>=4.36|-|[microsoft/Phi-3.5-mini-instruct](https://huggingface.co/microsoft/Phi-3.5-mini-instruct)| -|phi3_5-moe-instruct|[LLM-Research/Phi-3.5-MoE-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-MoE-instruct/summary)|q_proj, k_proj, v_proj|phi3|✔|✔|✘|✘|transformers>=4.36|moe|[microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)| +|phi3_5-moe-instruct|[LLM-Research/Phi-3.5-MoE-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-MoE-instruct/summary)|q_proj, k_proj, v_proj|phi3|✔|✘|✘|✘|transformers>=4.36|moe|[microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)| |mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)| |mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)| |mamba-390m|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-390m-hf](https://huggingface.co/state-spaces/mamba-390m-hf)| @@ -414,28 +369,21 @@ ### 多模态大模型 | Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support vLLM | Support LMDeploy | Support Megatron | Requires | Tags | HF Model ID | | --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | ---------------- | ---------------- | -------- | ---- | ----------- | -|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl-generation|✔|✔|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✔|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| -|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✔|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| +|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl-generation|✔|✘|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✘|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| +|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✘|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio-generation|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| |qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| -|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| -|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| -|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| -|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| -|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| -|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| -|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| -|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| -|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| -|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| -|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| -|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| -|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| -|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| -|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| +|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| +|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| +|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| +|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| +|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| +|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| +|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| +|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|✘|✘|✘|✘|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| |idefics3-8b-llama3|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3/summary)|^(model.text_model\|model.connector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|idefics3|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)| |llava1_5-7b-instruct|[swift/llava-1.5-7b-hf](https://modelscope.cn/models/swift/llava-1.5-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✘|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| @@ -475,11 +423,11 @@ |internvl2-26b|[OpenGVLab/InternVL2-26B](https://modelscope.cn/models/OpenGVLab/InternVL2-26B/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B)| |internvl2-40b|[OpenGVLab/InternVL2-40B](https://modelscope.cn/models/OpenGVLab/InternVL2-40B/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B](https://huggingface.co/OpenGVLab/InternVL2-40B)| |internvl2-llama3-76b|[OpenGVLab/InternVL2-Llama3-76B](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B)| -|internvl2-2b-awq|[OpenGVLab/InternVL2-2B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-2B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ)| -|internvl2-8b-awq|[OpenGVLab/InternVL2-8B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-8B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-8B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-8B-AWQ)| -|internvl2-26b-awq|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)| -|internvl2-40b-awq|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)| -|internvl2-llama3-76b-awq|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)| +|internvl2-2b-awq|[OpenGVLab/InternVL2-2B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-2B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ)| +|internvl2-8b-awq|[OpenGVLab/InternVL2-8B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-8B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-8B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-8B-AWQ)| +|internvl2-26b-awq|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)| +|internvl2-40b-awq|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)| +|internvl2-llama3-76b-awq|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)| |deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)| |deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)| |paligemma-3b-pt-224|[AI-ModelScope/paligemma-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-224/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|paligemma|✔|✔|✘|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-224](https://huggingface.co/google/paligemma-3b-pt-224)| diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md index 4a5b53facf..eb8f97a7ca 100644 --- a/docs/source_en/Instruction/Supported-models-datasets.md +++ b/docs/source_en/Instruction/Supported-models-datasets.md @@ -109,51 +109,6 @@ The table below introcudes all models supported by SWIFT: |qwen2-math-7b-instruct|[qwen/Qwen2-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Math-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✔|transformers>=4.37|-|[Qwen/Qwen2-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Math-7B-Instruct)| |qwen2-math-72b|[qwen/Qwen2-Math-72B](https://modelscope.cn/models/qwen/Qwen2-Math-72B/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✔|transformers>=4.37|-|[Qwen/Qwen2-Math-72B](https://huggingface.co/Qwen/Qwen2-Math-72B)| |qwen2-math-72b-instruct|[qwen/Qwen2-Math-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Math-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✔|transformers>=4.37|-|[Qwen/Qwen2-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2-Math-72B-Instruct)| -|qwen2_5-0_5b|[qwen/Qwen2.5-0.5B](https://modelscope.cn/models/qwen/Qwen2.5-0.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B)| -|qwen2_5-1_5b|[qwen/Qwen2.5-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-1.5B](https://huggingface.co/Qwen/Qwen2.5-1.5B)| -|qwen2_5-3b|[qwen/Qwen2.5-3B](https://modelscope.cn/models/qwen/Qwen2.5-3B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-3B](https://huggingface.co/Qwen/Qwen2.5-3B)| -|qwen2_5-7b|[qwen/Qwen2.5-7B](https://modelscope.cn/models/qwen/Qwen2.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B)| -|qwen2_5-14b|[qwen/Qwen2.5-14B](https://modelscope.cn/models/qwen/Qwen2.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B)| -|qwen2_5-32b|[qwen/Qwen2.5-32B](https://modelscope.cn/models/qwen/Qwen2.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B)| -|qwen2_5-72b|[qwen/Qwen2.5-72B](https://modelscope.cn/models/qwen/Qwen2.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B](https://huggingface.co/Qwen/Qwen2.5-72B)| -|qwen2_5-0_5b-instruct|[qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)| -|qwen2_5-1_5b-instruct|[qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)| -|qwen2_5-3b-instruct|[qwen/Qwen2.5-3B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)| -|qwen2_5-7b-instruct|[qwen/Qwen2.5-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)| -|qwen2_5-14b-instruct|[qwen/Qwen2.5-14B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)| -|qwen2_5-32b-instruct|[qwen/Qwen2.5-32B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)| -|qwen2_5-72b-instruct|[qwen/Qwen2.5-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct)| -|qwen2_5-0_5b-instruct-gptq-int4|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4)| -|qwen2_5-1_5b-instruct-gptq-int4|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4)| -|qwen2_5-3b-instruct-gptq-int4|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4)| -|qwen2_5-7b-instruct-gptq-int4|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4)| -|qwen2_5-14b-instruct-gptq-int4|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4)| -|qwen2_5-32b-instruct-gptq-int4|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4)| -|qwen2_5-72b-instruct-gptq-int4|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4)| -|qwen2_5-0_5b-instruct-gptq-int8|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8)| -|qwen2_5-1_5b-instruct-gptq-int8|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8)| -|qwen2_5-3b-instruct-gptq-int8|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8)| -|qwen2_5-7b-instruct-gptq-int8|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8)| -|qwen2_5-14b-instruct-gptq-int8|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8)| -|qwen2_5-32b-instruct-gptq-int8|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8)| -|qwen2_5-72b-instruct-gptq-int8|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8)| -|qwen2_5-0_5b-instruct-awq|[qwen/Qwen2.5-0.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-0.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-AWQ)| -|qwen2_5-1_5b-instruct-awq|[qwen/Qwen2.5-1.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-1.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-AWQ)| -|qwen2_5-3b-instruct-awq|[qwen/Qwen2.5-3B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-AWQ)| -|qwen2_5-7b-instruct-awq|[qwen/Qwen2.5-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-AWQ)| -|qwen2_5-14b-instruct-awq|[qwen/Qwen2.5-14B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-14B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-AWQ)| -|qwen2_5-32b-instruct-awq|[qwen/Qwen2.5-32B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-32B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-AWQ)| -|qwen2_5-72b-instruct-awq|[qwen/Qwen2.5-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-AWQ)| -|qwen2_5-math-1_5b|[qwen/Qwen2.5-Math-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B)| -|qwen2_5-math-7b|[qwen/Qwen2.5-Math-7B](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B)| -|qwen2_5-math-72b|[qwen/Qwen2.5-Math-72B](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B](https://huggingface.co/Qwen/Qwen2.5-Math-72B)| -|qwen2_5-math-1_5b-instruct|[qwen/Qwen2.5-Math-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct)| -|qwen2_5-math-7b-instruct|[qwen/Qwen2.5-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct)| -|qwen2_5-math-72b-instruct|[qwen/Qwen2.5-Math-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct)| -|qwen2_5-coder-1_5b|[qwen/Qwen2.5-Coder-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B)| -|qwen2_5-coder-1_5b-instruct|[qwen/Qwen2.5-Coder-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)| -|qwen2_5-coder-7b|[qwen/Qwen2.5-Coder-7B](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B](https://huggingface.co/Qwen/Qwen2.5-Coder-7B)| -|qwen2_5-coder-7b-instruct|[qwen/Qwen2.5-Coder-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct)| |chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)| |chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm2-6b-32k](https://huggingface.co/THUDM/chatglm2-6b-32k)| |chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm3-6b-base](https://huggingface.co/THUDM/chatglm3-6b-base)| @@ -391,7 +346,7 @@ The table below introcudes all models supported by SWIFT: |phi3-small-128k-instruct|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct/summary)|query_key_value|phi3|✔|✔|✘|✘|transformers>=4.36|-|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)| |phi3-medium-128k-instruct|[LLM-Research/Phi-3-medium-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-medium-128k-instruct/summary)|qkv_proj|phi3|✔|✔|✘|✘|transformers>=4.36|-|[microsoft/Phi-3-medium-128k-instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)| |phi3_5-mini-instruct|[LLM-Research/Phi-3.5-mini-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-mini-instruct/summary)|qkv_proj|phi3|✔|✔|✘|✘|transformers>=4.36|-|[microsoft/Phi-3.5-mini-instruct](https://huggingface.co/microsoft/Phi-3.5-mini-instruct)| -|phi3_5-moe-instruct|[LLM-Research/Phi-3.5-MoE-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-MoE-instruct/summary)|q_proj, k_proj, v_proj|phi3|✔|✔|✘|✘|transformers>=4.36|moe|[microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)| +|phi3_5-moe-instruct|[LLM-Research/Phi-3.5-MoE-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-MoE-instruct/summary)|q_proj, k_proj, v_proj|phi3|✔|✘|✘|✘|transformers>=4.36|moe|[microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)| |mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)| |mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)| |mamba-390m|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-390m-hf](https://huggingface.co/state-spaces/mamba-390m-hf)| @@ -414,28 +369,21 @@ The table below introcudes all models supported by SWIFT: ### MLLM | Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support vLLM | Support LMDeploy | Support Megatron | Requires | Tags | HF Model ID | | --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | ---------------- | ---------------- | -------- | ---- | ----------- | -|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl-generation|✔|✔|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✔|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| -|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✔|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| +|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl-generation|✔|✘|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✘|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| +|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✘|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio-generation|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| |qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| -|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| -|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| -|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| -|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| -|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| -|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| -|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| -|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| -|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| -|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| -|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| -|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| -|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| -|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| -|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| +|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| +|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| +|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| +|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| +|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| +|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| +|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| +|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|✘|✘|✘|✘|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| |idefics3-8b-llama3|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3/summary)|^(model.text_model\|model.connector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|idefics3|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)| |llava1_5-7b-instruct|[swift/llava-1.5-7b-hf](https://modelscope.cn/models/swift/llava-1.5-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✘|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| @@ -475,11 +423,11 @@ The table below introcudes all models supported by SWIFT: |internvl2-26b|[OpenGVLab/InternVL2-26B](https://modelscope.cn/models/OpenGVLab/InternVL2-26B/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B)| |internvl2-40b|[OpenGVLab/InternVL2-40B](https://modelscope.cn/models/OpenGVLab/InternVL2-40B/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B](https://huggingface.co/OpenGVLab/InternVL2-40B)| |internvl2-llama3-76b|[OpenGVLab/InternVL2-Llama3-76B](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B)| -|internvl2-2b-awq|[OpenGVLab/InternVL2-2B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-2B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ)| -|internvl2-8b-awq|[OpenGVLab/InternVL2-8B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-8B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-8B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-8B-AWQ)| -|internvl2-26b-awq|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)| -|internvl2-40b-awq|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)| -|internvl2-llama3-76b-awq|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)| +|internvl2-2b-awq|[OpenGVLab/InternVL2-2B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-2B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ)| +|internvl2-8b-awq|[OpenGVLab/InternVL2-8B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-8B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-8B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-8B-AWQ)| +|internvl2-26b-awq|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)| +|internvl2-40b-awq|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)| +|internvl2-llama3-76b-awq|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)| |deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)| |deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)| |paligemma-3b-pt-224|[AI-ModelScope/paligemma-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-224/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|paligemma|✔|✔|✘|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-224](https://huggingface.co/google/paligemma-3b-pt-224)| From 9a61fad16cfa7de88f16400cd0fea52fb6b4a569 Mon Sep 17 00:00:00 2001 From: DaozeZhang Date: Fri, 20 Sep 2024 10:13:45 +0800 Subject: [PATCH 6/7] remove transformers in gitignore (to PR this branch) --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index d6764baebd..084ee30b84 100644 --- a/.gitignore +++ b/.gitignore @@ -131,7 +131,6 @@ output/ *.out benchmarks/ eval_outputs/ -transformers/ # Pytorch *.pth From eac32a11da8db17e8126c66b92f93b343edc24da Mon Sep 17 00:00:00 2001 From: DaozeZhang Date: Fri, 20 Sep 2024 10:28:10 +0800 Subject: [PATCH 7/7] update doc --- ...14\346\225\260\346\215\256\351\233\206.md" | 87 +++++++++++++++---- .../Instruction/Supported-models-datasets.md | 87 +++++++++++++++---- 2 files changed, 140 insertions(+), 34 deletions(-) diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index bfbae085da..e7c37f6e6b 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -109,6 +109,51 @@ |qwen2-math-7b-instruct|[qwen/Qwen2-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Math-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✔|transformers>=4.37|-|[Qwen/Qwen2-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Math-7B-Instruct)| |qwen2-math-72b|[qwen/Qwen2-Math-72B](https://modelscope.cn/models/qwen/Qwen2-Math-72B/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✔|transformers>=4.37|-|[Qwen/Qwen2-Math-72B](https://huggingface.co/Qwen/Qwen2-Math-72B)| |qwen2-math-72b-instruct|[qwen/Qwen2-Math-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Math-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✔|transformers>=4.37|-|[Qwen/Qwen2-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2-Math-72B-Instruct)| +|qwen2_5-0_5b|[qwen/Qwen2.5-0.5B](https://modelscope.cn/models/qwen/Qwen2.5-0.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B)| +|qwen2_5-1_5b|[qwen/Qwen2.5-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-1.5B](https://huggingface.co/Qwen/Qwen2.5-1.5B)| +|qwen2_5-3b|[qwen/Qwen2.5-3B](https://modelscope.cn/models/qwen/Qwen2.5-3B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-3B](https://huggingface.co/Qwen/Qwen2.5-3B)| +|qwen2_5-7b|[qwen/Qwen2.5-7B](https://modelscope.cn/models/qwen/Qwen2.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B)| +|qwen2_5-14b|[qwen/Qwen2.5-14B](https://modelscope.cn/models/qwen/Qwen2.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B)| +|qwen2_5-32b|[qwen/Qwen2.5-32B](https://modelscope.cn/models/qwen/Qwen2.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B)| +|qwen2_5-72b|[qwen/Qwen2.5-72B](https://modelscope.cn/models/qwen/Qwen2.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B](https://huggingface.co/Qwen/Qwen2.5-72B)| +|qwen2_5-0_5b-instruct|[qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)| +|qwen2_5-1_5b-instruct|[qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)| +|qwen2_5-3b-instruct|[qwen/Qwen2.5-3B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)| +|qwen2_5-7b-instruct|[qwen/Qwen2.5-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)| +|qwen2_5-14b-instruct|[qwen/Qwen2.5-14B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)| +|qwen2_5-32b-instruct|[qwen/Qwen2.5-32B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)| +|qwen2_5-72b-instruct|[qwen/Qwen2.5-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct)| +|qwen2_5-0_5b-instruct-gptq-int4|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4)| +|qwen2_5-1_5b-instruct-gptq-int4|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4)| +|qwen2_5-3b-instruct-gptq-int4|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4)| +|qwen2_5-7b-instruct-gptq-int4|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4)| +|qwen2_5-14b-instruct-gptq-int4|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4)| +|qwen2_5-32b-instruct-gptq-int4|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4)| +|qwen2_5-72b-instruct-gptq-int4|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4)| +|qwen2_5-0_5b-instruct-gptq-int8|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8)| +|qwen2_5-1_5b-instruct-gptq-int8|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8)| +|qwen2_5-3b-instruct-gptq-int8|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8)| +|qwen2_5-7b-instruct-gptq-int8|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8)| +|qwen2_5-14b-instruct-gptq-int8|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8)| +|qwen2_5-32b-instruct-gptq-int8|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8)| +|qwen2_5-72b-instruct-gptq-int8|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8)| +|qwen2_5-0_5b-instruct-awq|[qwen/Qwen2.5-0.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-0.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-AWQ)| +|qwen2_5-1_5b-instruct-awq|[qwen/Qwen2.5-1.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-1.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-AWQ)| +|qwen2_5-3b-instruct-awq|[qwen/Qwen2.5-3B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-AWQ)| +|qwen2_5-7b-instruct-awq|[qwen/Qwen2.5-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-AWQ)| +|qwen2_5-14b-instruct-awq|[qwen/Qwen2.5-14B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-14B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-AWQ)| +|qwen2_5-32b-instruct-awq|[qwen/Qwen2.5-32B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-32B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-AWQ)| +|qwen2_5-72b-instruct-awq|[qwen/Qwen2.5-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-AWQ)| +|qwen2_5-math-1_5b|[qwen/Qwen2.5-Math-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B)| +|qwen2_5-math-7b|[qwen/Qwen2.5-Math-7B](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B)| +|qwen2_5-math-72b|[qwen/Qwen2.5-Math-72B](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B](https://huggingface.co/Qwen/Qwen2.5-Math-72B)| +|qwen2_5-math-1_5b-instruct|[qwen/Qwen2.5-Math-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct)| +|qwen2_5-math-7b-instruct|[qwen/Qwen2.5-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct)| +|qwen2_5-math-72b-instruct|[qwen/Qwen2.5-Math-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct)| +|qwen2_5-coder-1_5b|[qwen/Qwen2.5-Coder-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B)| +|qwen2_5-coder-1_5b-instruct|[qwen/Qwen2.5-Coder-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)| +|qwen2_5-coder-7b|[qwen/Qwen2.5-Coder-7B](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B](https://huggingface.co/Qwen/Qwen2.5-Coder-7B)| +|qwen2_5-coder-7b-instruct|[qwen/Qwen2.5-Coder-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct)| |chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)| |chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm2-6b-32k](https://huggingface.co/THUDM/chatglm2-6b-32k)| |chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm3-6b-base](https://huggingface.co/THUDM/chatglm3-6b-base)| @@ -289,6 +334,7 @@ |mistral-nemo-base-2407|[AI-ModelScope/Mistral-Nemo-Base-2407](https://modelscope.cn/models/AI-ModelScope/Mistral-Nemo-Base-2407/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Nemo-Base-2407](https://huggingface.co/mistralai/Mistral-Nemo-Base-2407)| |mistral-nemo-instruct-2407|[AI-ModelScope/Mistral-Nemo-Instruct-2407](https://modelscope.cn/models/AI-ModelScope/Mistral-Nemo-Instruct-2407/summary)|q_proj, k_proj, v_proj|mistral-nemo|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407)| |mistral-large-instruct-2407|[LLM-Research/Mistral-Large-Instruct-2407](https://modelscope.cn/models/LLM-Research/Mistral-Large-Instruct-2407/summary)|q_proj, k_proj, v_proj|mistral-nemo|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Large-Instruct-2407](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)| +|mistral-small-instruct-2409|[AI-ModelScope/Mistral-Small-Instruct-2409](https://modelscope.cn/models/AI-ModelScope/Mistral-Small-Instruct-2409/summary)|q_proj, k_proj, v_proj|mistral-nemo|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Small-Instruct-2409](https://huggingface.co/mistralai/Mistral-Small-Instruct-2409)| |mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✘|✘|transformers>=4.36|moe|[mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)| |mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|✘|✘|transformers>=4.36|moe|[mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| |mixtral-moe-7b-aqlm-2bit-1x16|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation|✔|✘|✘|✘|transformers>=4.38, aqlm, torch>=2.2.0|moe|[ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf)| @@ -346,7 +392,7 @@ |phi3-small-128k-instruct|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct/summary)|query_key_value|phi3|✔|✔|✘|✘|transformers>=4.36|-|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)| |phi3-medium-128k-instruct|[LLM-Research/Phi-3-medium-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-medium-128k-instruct/summary)|qkv_proj|phi3|✔|✔|✘|✘|transformers>=4.36|-|[microsoft/Phi-3-medium-128k-instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)| |phi3_5-mini-instruct|[LLM-Research/Phi-3.5-mini-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-mini-instruct/summary)|qkv_proj|phi3|✔|✔|✘|✘|transformers>=4.36|-|[microsoft/Phi-3.5-mini-instruct](https://huggingface.co/microsoft/Phi-3.5-mini-instruct)| -|phi3_5-moe-instruct|[LLM-Research/Phi-3.5-MoE-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-MoE-instruct/summary)|q_proj, k_proj, v_proj|phi3|✔|✘|✘|✘|transformers>=4.36|moe|[microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)| +|phi3_5-moe-instruct|[LLM-Research/Phi-3.5-MoE-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-MoE-instruct/summary)|q_proj, k_proj, v_proj|phi3|✔|✔|✘|✘|transformers>=4.36|moe|[microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)| |mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)| |mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)| |mamba-390m|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-390m-hf](https://huggingface.co/state-spaces/mamba-390m-hf)| @@ -369,21 +415,28 @@ ### 多模态大模型 | Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support vLLM | Support LMDeploy | Support Megatron | Requires | Tags | HF Model ID | | --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | ---------------- | ---------------- | -------- | ---- | ----------- | -|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl-generation|✔|✘|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✘|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| -|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✘|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| +|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl-generation|✔|✔|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✔|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| +|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✔|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio-generation|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| |qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| -|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| -|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| -|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| -|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| -|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| -|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| -|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| -|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| +|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| +|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| +|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| +|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| +|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| +|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| +|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| +|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| +|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| +|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| +|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| +|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| +|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| +|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| +|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|✘|✘|✘|✘|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| |idefics3-8b-llama3|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3/summary)|^(model.text_model\|model.connector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|idefics3|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)| |llava1_5-7b-instruct|[swift/llava-1.5-7b-hf](https://modelscope.cn/models/swift/llava-1.5-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✘|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| @@ -423,11 +476,11 @@ |internvl2-26b|[OpenGVLab/InternVL2-26B](https://modelscope.cn/models/OpenGVLab/InternVL2-26B/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B)| |internvl2-40b|[OpenGVLab/InternVL2-40B](https://modelscope.cn/models/OpenGVLab/InternVL2-40B/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B](https://huggingface.co/OpenGVLab/InternVL2-40B)| |internvl2-llama3-76b|[OpenGVLab/InternVL2-Llama3-76B](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B)| -|internvl2-2b-awq|[OpenGVLab/InternVL2-2B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-2B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ)| -|internvl2-8b-awq|[OpenGVLab/InternVL2-8B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-8B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-8B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-8B-AWQ)| -|internvl2-26b-awq|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)| -|internvl2-40b-awq|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)| -|internvl2-llama3-76b-awq|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)| +|internvl2-2b-awq|[OpenGVLab/InternVL2-2B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-2B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ)| +|internvl2-8b-awq|[OpenGVLab/InternVL2-8B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-8B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-8B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-8B-AWQ)| +|internvl2-26b-awq|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)| +|internvl2-40b-awq|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)| +|internvl2-llama3-76b-awq|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)| |deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)| |deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)| |paligemma-3b-pt-224|[AI-ModelScope/paligemma-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-224/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|paligemma|✔|✔|✘|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-224](https://huggingface.co/google/paligemma-3b-pt-224)| diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md index eb8f97a7ca..e499219f98 100644 --- a/docs/source_en/Instruction/Supported-models-datasets.md +++ b/docs/source_en/Instruction/Supported-models-datasets.md @@ -109,6 +109,51 @@ The table below introcudes all models supported by SWIFT: |qwen2-math-7b-instruct|[qwen/Qwen2-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Math-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✔|transformers>=4.37|-|[Qwen/Qwen2-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Math-7B-Instruct)| |qwen2-math-72b|[qwen/Qwen2-Math-72B](https://modelscope.cn/models/qwen/Qwen2-Math-72B/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✔|transformers>=4.37|-|[Qwen/Qwen2-Math-72B](https://huggingface.co/Qwen/Qwen2-Math-72B)| |qwen2-math-72b-instruct|[qwen/Qwen2-Math-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Math-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✔|transformers>=4.37|-|[Qwen/Qwen2-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2-Math-72B-Instruct)| +|qwen2_5-0_5b|[qwen/Qwen2.5-0.5B](https://modelscope.cn/models/qwen/Qwen2.5-0.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B)| +|qwen2_5-1_5b|[qwen/Qwen2.5-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-1.5B](https://huggingface.co/Qwen/Qwen2.5-1.5B)| +|qwen2_5-3b|[qwen/Qwen2.5-3B](https://modelscope.cn/models/qwen/Qwen2.5-3B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-3B](https://huggingface.co/Qwen/Qwen2.5-3B)| +|qwen2_5-7b|[qwen/Qwen2.5-7B](https://modelscope.cn/models/qwen/Qwen2.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B)| +|qwen2_5-14b|[qwen/Qwen2.5-14B](https://modelscope.cn/models/qwen/Qwen2.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B)| +|qwen2_5-32b|[qwen/Qwen2.5-32B](https://modelscope.cn/models/qwen/Qwen2.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B)| +|qwen2_5-72b|[qwen/Qwen2.5-72B](https://modelscope.cn/models/qwen/Qwen2.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B](https://huggingface.co/Qwen/Qwen2.5-72B)| +|qwen2_5-0_5b-instruct|[qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)| +|qwen2_5-1_5b-instruct|[qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)| +|qwen2_5-3b-instruct|[qwen/Qwen2.5-3B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)| +|qwen2_5-7b-instruct|[qwen/Qwen2.5-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)| +|qwen2_5-14b-instruct|[qwen/Qwen2.5-14B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)| +|qwen2_5-32b-instruct|[qwen/Qwen2.5-32B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)| +|qwen2_5-72b-instruct|[qwen/Qwen2.5-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct)| +|qwen2_5-0_5b-instruct-gptq-int4|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4)| +|qwen2_5-1_5b-instruct-gptq-int4|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4)| +|qwen2_5-3b-instruct-gptq-int4|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4)| +|qwen2_5-7b-instruct-gptq-int4|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4)| +|qwen2_5-14b-instruct-gptq-int4|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4)| +|qwen2_5-32b-instruct-gptq-int4|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4)| +|qwen2_5-72b-instruct-gptq-int4|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4)| +|qwen2_5-0_5b-instruct-gptq-int8|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8)| +|qwen2_5-1_5b-instruct-gptq-int8|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8)| +|qwen2_5-3b-instruct-gptq-int8|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8)| +|qwen2_5-7b-instruct-gptq-int8|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8)| +|qwen2_5-14b-instruct-gptq-int8|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8)| +|qwen2_5-32b-instruct-gptq-int8|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8)| +|qwen2_5-72b-instruct-gptq-int8|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8)| +|qwen2_5-0_5b-instruct-awq|[qwen/Qwen2.5-0.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-0.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-AWQ)| +|qwen2_5-1_5b-instruct-awq|[qwen/Qwen2.5-1.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-1.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-AWQ)| +|qwen2_5-3b-instruct-awq|[qwen/Qwen2.5-3B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-AWQ)| +|qwen2_5-7b-instruct-awq|[qwen/Qwen2.5-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-AWQ)| +|qwen2_5-14b-instruct-awq|[qwen/Qwen2.5-14B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-14B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-AWQ)| +|qwen2_5-32b-instruct-awq|[qwen/Qwen2.5-32B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-32B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-AWQ)| +|qwen2_5-72b-instruct-awq|[qwen/Qwen2.5-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-AWQ)| +|qwen2_5-math-1_5b|[qwen/Qwen2.5-Math-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B)| +|qwen2_5-math-7b|[qwen/Qwen2.5-Math-7B](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B)| +|qwen2_5-math-72b|[qwen/Qwen2.5-Math-72B](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B](https://huggingface.co/Qwen/Qwen2.5-Math-72B)| +|qwen2_5-math-1_5b-instruct|[qwen/Qwen2.5-Math-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct)| +|qwen2_5-math-7b-instruct|[qwen/Qwen2.5-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct)| +|qwen2_5-math-72b-instruct|[qwen/Qwen2.5-Math-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct)| +|qwen2_5-coder-1_5b|[qwen/Qwen2.5-Coder-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B)| +|qwen2_5-coder-1_5b-instruct|[qwen/Qwen2.5-Coder-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)| +|qwen2_5-coder-7b|[qwen/Qwen2.5-Coder-7B](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B](https://huggingface.co/Qwen/Qwen2.5-Coder-7B)| +|qwen2_5-coder-7b-instruct|[qwen/Qwen2.5-Coder-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct)| |chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)| |chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm2-6b-32k](https://huggingface.co/THUDM/chatglm2-6b-32k)| |chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm3-6b-base](https://huggingface.co/THUDM/chatglm3-6b-base)| @@ -289,6 +334,7 @@ The table below introcudes all models supported by SWIFT: |mistral-nemo-base-2407|[AI-ModelScope/Mistral-Nemo-Base-2407](https://modelscope.cn/models/AI-ModelScope/Mistral-Nemo-Base-2407/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Nemo-Base-2407](https://huggingface.co/mistralai/Mistral-Nemo-Base-2407)| |mistral-nemo-instruct-2407|[AI-ModelScope/Mistral-Nemo-Instruct-2407](https://modelscope.cn/models/AI-ModelScope/Mistral-Nemo-Instruct-2407/summary)|q_proj, k_proj, v_proj|mistral-nemo|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407)| |mistral-large-instruct-2407|[LLM-Research/Mistral-Large-Instruct-2407](https://modelscope.cn/models/LLM-Research/Mistral-Large-Instruct-2407/summary)|q_proj, k_proj, v_proj|mistral-nemo|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Large-Instruct-2407](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)| +|mistral-small-instruct-2409|[AI-ModelScope/Mistral-Small-Instruct-2409](https://modelscope.cn/models/AI-ModelScope/Mistral-Small-Instruct-2409/summary)|q_proj, k_proj, v_proj|mistral-nemo|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Small-Instruct-2409](https://huggingface.co/mistralai/Mistral-Small-Instruct-2409)| |mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✘|✘|transformers>=4.36|moe|[mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)| |mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|✘|✘|transformers>=4.36|moe|[mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| |mixtral-moe-7b-aqlm-2bit-1x16|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation|✔|✘|✘|✘|transformers>=4.38, aqlm, torch>=2.2.0|moe|[ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf)| @@ -346,7 +392,7 @@ The table below introcudes all models supported by SWIFT: |phi3-small-128k-instruct|[LLM-Research/Phi-3-small-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-small-128k-instruct/summary)|query_key_value|phi3|✔|✔|✘|✘|transformers>=4.36|-|[microsoft/Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct)| |phi3-medium-128k-instruct|[LLM-Research/Phi-3-medium-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-medium-128k-instruct/summary)|qkv_proj|phi3|✔|✔|✘|✘|transformers>=4.36|-|[microsoft/Phi-3-medium-128k-instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)| |phi3_5-mini-instruct|[LLM-Research/Phi-3.5-mini-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-mini-instruct/summary)|qkv_proj|phi3|✔|✔|✘|✘|transformers>=4.36|-|[microsoft/Phi-3.5-mini-instruct](https://huggingface.co/microsoft/Phi-3.5-mini-instruct)| -|phi3_5-moe-instruct|[LLM-Research/Phi-3.5-MoE-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-MoE-instruct/summary)|q_proj, k_proj, v_proj|phi3|✔|✘|✘|✘|transformers>=4.36|moe|[microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)| +|phi3_5-moe-instruct|[LLM-Research/Phi-3.5-MoE-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-MoE-instruct/summary)|q_proj, k_proj, v_proj|phi3|✔|✔|✘|✘|transformers>=4.36|moe|[microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)| |mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)| |mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)| |mamba-390m|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-390m-hf](https://huggingface.co/state-spaces/mamba-390m-hf)| @@ -369,21 +415,28 @@ The table below introcudes all models supported by SWIFT: ### MLLM | Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support vLLM | Support LMDeploy | Support Megatron | Requires | Tags | HF Model ID | | --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | ---------------- | ---------------- | -------- | ---- | ----------- | -|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl-generation|✔|✘|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✘|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| -|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✘|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| +|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl-generation|✔|✔|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✔|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| +|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✔|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| |qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio-generation|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| |qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| |qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| |qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| -|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| -|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| -|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| -|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| -|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| -|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| -|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| -|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✘|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| +|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)| +|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)| +|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)| +|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)| +|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)| +|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)| +|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)| +|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)| +|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)| +|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)| +|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)| +|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)| +|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)| +|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)| +|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)| |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|✘|✘|✘|✘|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)| |idefics3-8b-llama3|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3/summary)|^(model.text_model\|model.connector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|idefics3|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)| |llava1_5-7b-instruct|[swift/llava-1.5-7b-hf](https://modelscope.cn/models/swift/llava-1.5-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✘|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| @@ -423,11 +476,11 @@ The table below introcudes all models supported by SWIFT: |internvl2-26b|[OpenGVLab/InternVL2-26B](https://modelscope.cn/models/OpenGVLab/InternVL2-26B/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B)| |internvl2-40b|[OpenGVLab/InternVL2-40B](https://modelscope.cn/models/OpenGVLab/InternVL2-40B/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B](https://huggingface.co/OpenGVLab/InternVL2-40B)| |internvl2-llama3-76b|[OpenGVLab/InternVL2-Llama3-76B](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B)| -|internvl2-2b-awq|[OpenGVLab/InternVL2-2B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-2B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ)| -|internvl2-8b-awq|[OpenGVLab/InternVL2-8B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-8B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-8B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-8B-AWQ)| -|internvl2-26b-awq|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)| -|internvl2-40b-awq|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)| -|internvl2-llama3-76b-awq|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✘|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)| +|internvl2-2b-awq|[OpenGVLab/InternVL2-2B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-2B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ)| +|internvl2-8b-awq|[OpenGVLab/InternVL2-8B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-8B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-8B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-8B-AWQ)| +|internvl2-26b-awq|[OpenGVLab/InternVL2-26B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-26B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-26B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-26B-AWQ)| +|internvl2-40b-awq|[OpenGVLab/InternVL2-40B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-40B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-40B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-40B-AWQ)| +|internvl2-llama3-76b-awq|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://modelscope.cn/models/OpenGVLab/InternVL2-Llama3-76B-AWQ/summary)|^(language_model\|mlp1)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|internvl2|✔|✔|✔|✘|transformers>=4.36, timm|vision, video|[OpenGVLab/InternVL2-Llama3-76B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B-AWQ)| |deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)| |deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|^(language_model\|aligner)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|deepseek-vl|✔|✘|✔|✘||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)| |paligemma-3b-pt-224|[AI-ModelScope/paligemma-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-224/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|paligemma|✔|✔|✘|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-224](https://huggingface.co/google/paligemma-3b-pt-224)|