From fb974069eaaadf23a60bc58f480744e728c50d78 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 16 Apr 2024 13:39:47 +0800
Subject: [PATCH 01/26] update use_hf

---
 ...271\211\344\270\216\346\213\223\345\261\225.md" |  3 +--
 docs/source/Multi-Modal/index.md                   | 14 +++++++-------
 docs/source_en/LLM/Customization.md                |  3 +--
 swift/llm/dpo.py                                   |  3 +--
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md"
index fac24df81f..6815159727 100644
--- "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md"
+++ "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md"
@@ -257,8 +257,7 @@ if __name__ == '__main__':
 - `get_function`: 默认值为`None`. 获取model和tokenizer的函数. 如果传入None, 则使用修饰器方案进行模型注册. 如果传入一个函数, 则使用正常方案进行注册.
 - `requires`: 默认为`[]`. 表示模型所需要的区别于其他模型的依赖. 该参数一般不需要设置.
 - `torch_dtype`: 默认为`None`. 表示模型所推荐使用的torch_dtype. 该参数一般不需要设置.
-- `use_hf`: 默认为`False`, 即设置为modelscope hub. 如果你要使用huggingface hub, 你可以设置为True.
-- `revision`: 默认为`None`. 用于指定模型的版本号, 如果`use_hf`为False, 则设置为'master', 如果`use_hf`为True, 则设置为'main'. 如果`model_id_or_path`是本地的模型目录, 则该参数失效. 该参数一般不需要设置.
+- `revision`: 默认为`None`. 用于指定模型的版本号. 如果`model_id_or_path`是本地的模型目录, 则该参数失效. 该参数一般不需要设置.
 - `ignore_file_pattern`: 默认为`None`. 表示下载的时候需要忽略的文件名的正则pattern, 该参数会传递给`snapshot_download`. 例如`r'.+\.bin$'`, `r'.+\.savetensors$'`等. 该参数一般不需要设置.
 - `**kwargs`: 其他用于注释模型能力的参数. 该参数一般不需要设置.
 
diff --git a/docs/source/Multi-Modal/index.md b/docs/source/Multi-Modal/index.md
index fecbb3c493..50475ace57 100644
--- a/docs/source/Multi-Modal/index.md
+++ b/docs/source/Multi-Modal/index.md
@@ -3,11 +3,11 @@
 ### Multi-Modal最佳实践系列
 
 1. [Qwen-VL最佳实践](qwen-vl最佳实践.md)
-2. [Qwen-Audio最佳实践](qwen-auidio最佳实践.md)
+2. [Qwen-Audio最佳实践](qwen-audio最佳实践.md)
 3. [Llava最佳实践](llava最佳实践.md)
-4. [Deepseek-VL最佳实践](../Multi-Modal/deepseek-vl最佳实践.md)
-5. [Yi-VL最佳实践.md](../Multi-Modal/yi-vl最佳实践.md)
-6. [Internlm2-Xcomposers最佳实践](../Multi-Modal/internlm-xcomposer2最佳实践.md)
-7. [MiniCPM-V最佳实践](../Multi-Modal/minicpm-v最佳实践.md), [MiniCPM-V-2最佳实践](../Multi-Modal/minicpm-v-2最佳实践.md)
-8. [CogVLM最佳实践](../Multi-Modal/cogvlm最佳实践.md)
-9. [mPLUG-Owl2最佳实践](../Multi-Modal/mplug-owl2最佳实践.md)
+4. [Deepseek-VL最佳实践](deepseek-vl最佳实践.md)
+5. [Yi-VL最佳实践.md](yi-vl最佳实践.md)
+6. [Internlm2-Xcomposers最佳实践](internlm-xcomposer2最佳实践.md)
+7. [MiniCPM-V最佳实践](minicpm-v最佳实践.md), [MiniCPM-V-2最佳实践](minicpm-v-2最佳实践.md)
+8. [CogVLM最佳实践](cogvlm最佳实践.md)
+9. [mPLUG-Owl2最佳实践](mplug-owl2最佳实践.md)
diff --git a/docs/source_en/LLM/Customization.md b/docs/source_en/LLM/Customization.md
index 59e7d1939f..1cdf6a8ccd 100644
--- a/docs/source_en/LLM/Customization.md
+++ b/docs/source_en/LLM/Customization.md
@@ -256,8 +256,7 @@ if __name__ == '__main__':
 - `get_function`: Default value is `None`. The function to get model and tokenizer. If passed `None`, the decorator approach will be used to register the model. If passed a function, the normal approach will be used to register.
 - `requires`: Default is `[]`. Represents the dependencies required by the model that differ from other models. This parameter generally does not need to be set.
 - `torch_dtype`: Default is `None`. Represents the recommended torch_dtype for the model to use. This parameter generally does not need to be set.
-- `use_hf`: Default is `False`, i.e. set to modelscope hub. If you want to use huggingface hub, you can set it to True.
-- `revision`: Default is `None`. Used to specify the version number of the model. If `use_hf` is False, it is set to 'master', if `use_hf` is True, it is set to 'main'. If `model_id_or_path` is a local model directory, this parameter is not effective. This parameter generally does not need to be set.
+- `revision`: Default is `None`. Used to specify the version number of the model. If `model_id_or_path` is a local model directory, this parameter is not effective. This parameter generally does not need to be set.
 - `ignore_file_pattern`: Default is `None`. Represents the regular pattern of file names to be ignored when downloading, this parameter will be passed to `snapshot_download`. For example, `r'.+\.bin$'`, `r'.+\.savetensors$'`, etc. This parameter generally does not need to be set.
 - `**kwargs`: Other parameters used to annotate model capabilities. This parameter generally does not need to be set.
 
diff --git a/swift/llm/dpo.py b/swift/llm/dpo.py
index a43a46a724..19d6d9b543 100644
--- a/swift/llm/dpo.py
+++ b/swift/llm/dpo.py
@@ -12,10 +12,9 @@
 from swift.utils import (check_json_format, get_dist_setting, get_logger,
                          get_main, get_model_info, is_ddp_plus_mp, is_dist,
                          is_master, plot_images, seed_everything, show_layers)
-from . import get_time_info
 from .tuner import prepare_model
 from .utils import (DPOArguments, Template, get_dataset, get_model_tokenizer,
-                    get_template, set_generation_config)
+                    get_template, set_generation_config, get_time_info)
 
 logger = get_logger()
 

From 1432b32c1acb8bb226798c8c9a6645375ff1dba5 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 16 Apr 2024 15:47:36 +0800
Subject: [PATCH 02/26] update use_hf

---
 swift/llm/dpo.py         |   2 +-
 swift/llm/utils/model.py | 798 +++++++++++++++++++++++++--------------
 2 files changed, 524 insertions(+), 276 deletions(-)

diff --git a/swift/llm/dpo.py b/swift/llm/dpo.py
index 19d6d9b543..58e6de18cc 100644
--- a/swift/llm/dpo.py
+++ b/swift/llm/dpo.py
@@ -14,7 +14,7 @@
                          is_master, plot_images, seed_everything, show_layers)
 from .tuner import prepare_model
 from .utils import (DPOArguments, Template, get_dataset, get_model_tokenizer,
-                    get_template, set_generation_config, get_time_info)
+                    get_template, get_time_info, set_generation_config)
 
 logger = get_logger()
 
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 0b5c2cb9a4..2cf09e95c9 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -23,6 +23,7 @@
                           PreTrainedTokenizerBase)
 from transformers.dynamic_module_utils import get_class_from_dynamic_module
 from transformers.models.auto.tokenization_auto import get_tokenizer_config
+from transformers.utils import strtobool
 from transformers.utils.versions import require_version
 
 from swift import get_logger
@@ -95,6 +96,7 @@ class ModelType:
     qwen1half_4b_chat_awq = 'qwen1half-4b-chat-awq'
     qwen1half_7b_chat_awq = 'qwen1half-7b-chat-awq'
     qwen1half_14b_chat_awq = 'qwen1half-14b-chat-awq'
+    qwen1half_32b_chat_awq = 'qwen1half-32b-chat-awq'
     qwen1half_72b_chat_awq = 'qwen1half-72b-chat-awq'
 
     # qwen-vl
@@ -127,6 +129,7 @@ class ModelType:
     yi_6b_200k = 'yi-6b-200k'
     yi_6b_chat = 'yi-6b-chat'
     yi_9b = 'yi-9b'
+    yi_9b_200k = 'yi-9b-200k'
     yi_34b = 'yi-34b'
     yi_34b_200k = 'yi-34b-200k'
     yi_34b_chat = 'yi-34b-chat'
@@ -355,7 +358,7 @@ def register_model(
     *,
     requires: Optional[List[str]] = None,
     torch_dtype: Optional[Dtype] = None,
-    use_hf: bool = False,
+    hf_model_id: Optional[str] = None,
     revision: Optional[str] = None,
     ignore_file_pattern: Optional[List[str]] = None,
     function_kwargs: Optional[Dict[str, Any]] = None,
@@ -373,7 +376,7 @@ def register_model(
     if function_kwargs is None:
         function_kwargs = {}
     if revision is None:
-        revision = 'main' if use_hf else 'master'
+        revision = 'master'
     model_info = {
         'model_id_or_path': model_id_or_path,
         'lora_target_modules': lora_target_modules,
@@ -381,7 +384,7 @@ def register_model(
         'requires': requires,
         'torch_dtype': torch_dtype,
         'ignore_file_pattern': ignore_file_pattern,
-        'use_hf': use_hf,
+        'hf_model_id': hf_model_id,
         'revision': revision,
         'eos_token': eos_token,
         **kwargs
@@ -412,70 +415,125 @@ def _register_model(
     'Shanghai_AI_Laboratory/internlm-20b',
     LoRATM.llama2,
     TemplateType.default_generation_bos,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-20b')
 @register_model(
     ModelType.internlm_7b,
     'Shanghai_AI_Laboratory/internlm-7b',
     LoRATM.llama2,
     TemplateType.default_generation_bos,
-    support_vllm=True)
-@register_model(ModelType.bluelm_7b_chat_32k, 'vivo-ai/BlueLM-7B-Chat-32K',
-                LoRATM.llama2, TemplateType.bluelm)
-@register_model(ModelType.bluelm_7b_chat, 'vivo-ai/BlueLM-7B-Chat',
-                LoRATM.llama2, TemplateType.bluelm)
-@register_model(ModelType.bluelm_7b_32k, 'vivo-ai/BlueLM-7B-Base-32K',
-                LoRATM.llama2, TemplateType.default_generation_bos)
-@register_model(ModelType.bluelm_7b, 'vivo-ai/BlueLM-7B-Base', LoRATM.llama2,
-                TemplateType.default_generation_bos)
+    support_vllm=True,
+    hf_model_id='internlm/internlm-7b')
+@register_model(
+    ModelType.bluelm_7b_chat_32k,
+    'vivo-ai/BlueLM-7B-Chat-32K',
+    LoRATM.llama2,
+    TemplateType.bluelm,
+    hf_model_id='vivo-ai/BlueLM-7B-Chat-32K')
+@register_model(
+    ModelType.bluelm_7b_chat,
+    'vivo-ai/BlueLM-7B-Chat',
+    LoRATM.llama2,
+    TemplateType.bluelm,
+    hf_model_id='vivo-ai/BlueLM-7B-Chat')
+@register_model(
+    ModelType.bluelm_7b_32k,
+    'vivo-ai/BlueLM-7B-Base-32K',
+    LoRATM.llama2,
+    TemplateType.default_generation_bos,
+    hf_model_id='vivo-ai/BlueLM-7B-Base-32K')
+@register_model(
+    ModelType.bluelm_7b,
+    'vivo-ai/BlueLM-7B-Base',
+    LoRATM.llama2,
+    TemplateType.default_generation_bos,
+    hf_model_id='vivo-ai/BlueLM-7B-Base')
 @register_model(
     ModelType.seqgpt_560m,
     'damo/nlp_seqgpt-560m',
     LoRATM.bloom,
     TemplateType.default_generation,
-    support_vllm=True)
-@register_model(ModelType.xverse_13b_chat, 'xverse/XVERSE-13B-Chat',
-                LoRATM.llama2, TemplateType.xverse)
-@register_model(ModelType.xverse_13b, 'xverse/XVERSE-13B', LoRATM.llama2,
-                TemplateType.default_generation)
-@register_model(ModelType.xverse_65b, 'xverse/XVERSE-65B', LoRATM.llama2,
-                TemplateType.default_generation)
-@register_model(ModelType.xverse_65b_v2, 'xverse/XVERSE-65B-2', LoRATM.llama2,
-                TemplateType.default_generation)
-@register_model(ModelType.xverse_65b_chat, 'xverse/XVERSE-65B-Chat',
-                LoRATM.llama2, TemplateType.xverse)
+    support_vllm=True,
+    hf_model_id='DAMO-NLP/SeqGPT-560M')
+@register_model(
+    ModelType.xverse_13b_chat,
+    'xverse/XVERSE-13B-Chat',
+    LoRATM.llama2,
+    TemplateType.xverse,
+    hf_model_id='xverse/XVERSE-13B-Chat')
+@register_model(
+    ModelType.xverse_13b,
+    'xverse/XVERSE-13B',
+    LoRATM.llama2,
+    TemplateType.default_generation,
+    hf_model_id='xverse/XVERSE-13B')
+@register_model(
+    ModelType.xverse_65b,
+    'xverse/XVERSE-65B',
+    LoRATM.llama2,
+    TemplateType.default_generation,
+    hf_model_id='xverse/XVERSE-65B')
+@register_model(
+    ModelType.xverse_65b_v2,
+    'xverse/XVERSE-65B-2',
+    LoRATM.llama2,
+    TemplateType.default_generation,
+    hf_model_id='xverse/XVERSE-65B-2')
+@register_model(
+    ModelType.xverse_65b_chat,
+    'xverse/XVERSE-65B-Chat',
+    LoRATM.llama2,
+    TemplateType.xverse,
+    hf_model_id='xverse/XVERSE-65B-Chat')
 @register_model(
     ModelType.xverse_13b_256k,
     'xverse/XVERSE-13B-256K',
     LoRATM.llama2,
     TemplateType.default_generation,
-    revision='v1.0.0')
-@register_model(ModelType.xverse_7b_chat, 'xverse/XVERSE-7B-Chat',
-                LoRATM.llama2, TemplateType.xverse)
-@register_model(ModelType.xverse_7b, 'xverse/XVERSE-7B', LoRATM.llama2,
-                TemplateType.default_generation)
-@register_model(ModelType.xverse_moe_a4_2b, 'xverse/XVERSE-MoE-A4.2B',
-                LoRATM.llama2, TemplateType.default_generation)
+    revision='v1.0.0',
+    hf_model_id='xverse/XVERSE-13B-256K')
+@register_model(
+    ModelType.xverse_7b_chat,
+    'xverse/XVERSE-7B-Chat',
+    LoRATM.llama2,
+    TemplateType.xverse,
+    hf_model_id='xverse/XVERSE-7B-Chat')
+@register_model(
+    ModelType.xverse_7b,
+    'xverse/XVERSE-7B',
+    LoRATM.llama2,
+    TemplateType.default_generation,
+    hf_model_id='xverse/XVERSE-7B')
+@register_model(
+    ModelType.xverse_moe_a4_2b,
+    'xverse/XVERSE-MoE-A4.2B',
+    LoRATM.llama2,
+    TemplateType.default_generation,
+    hf_model_id='xverse/XVERSE-MoE-A4.2B')
 @register_model(
     ModelType.baichuan_13b_chat,
     'baichuan-inc/Baichuan-13B-Chat',
     LoRATM.baichuan,
     TemplateType.baichuan,
     requires=['transformers<4.34'],
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='baichuan-inc/Baichuan-13B-Chat')
 @register_model(
     ModelType.baichuan_7b,
     'baichuan-inc/baichuan-7B',
     LoRATM.baichuan,
     TemplateType.default_generation,
     requires=['transformers<4.34'],
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='baichuan-inc/Baichuan-7B')
 @register_model(
     ModelType.mengzi3_13b_base,
     'langboat/Mengzi3-13B-Base',
     LoRATM.llama2,
     TemplateType.mengzi,
     support_vllm=True,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Langboat/Mengzi3-13B-Base')
 @register_model(
     ModelType.c4ai_command_r_v01,
     'AI-ModelScope/c4ai-command-r-v01',
@@ -483,7 +541,8 @@ def _register_model(
     TemplateType.c4ai,
     requires=['transformers>=4.39.1'],
     support_vllm=False,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='CohereForAI/c4ai-command-r-v01')
 @register_model(
     ModelType.c4ai_command_r_plus,
     'AI-ModelScope/c4ai-command-r-plus',
@@ -491,7 +550,8 @@ def _register_model(
     TemplateType.c4ai,
     requires=['transformers>4.39'],
     support_vllm=False,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='CohereForAI/c4ai-command-r-plus')
 def get_model_tokenizer_from_repo(model_dir: str,
                                   torch_dtype: Optional[Dtype],
                                   model_kwargs: Dict[str, Any],
@@ -531,7 +591,8 @@ def get_model_tokenizer_from_repo(model_dir: str,
     LoRATM.grok_1,
     TemplateType.default_generation,
     support_vllm=False,
-    support_flash_attn=False)
+    support_flash_attn=False,
+    hf_model_id='hpcai-tech/grok-1')
 def get_model_tokenizer_grok(model_dir: str,
                              torch_dtype: Optional[Dtype],
                              model_kwargs: Dict[str, Any],
@@ -568,42 +629,48 @@ def get_model_tokenizer_grok(model_dir: str,
     LoRATM.mamba,
     TemplateType.default_generation,
     requires=['transformers>=4.39.0'],
-    support_vllm=False)
+    support_vllm=False,
+    hf_model_id='state-spaces/mamba-130m-hf')
 @register_model(
     ModelType.mamba_370m,
     'AI-ModelScope/mamba-370m-hf',
     LoRATM.mamba,
     TemplateType.default_generation,
     requires=['transformers>=4.39.0'],
-    support_vllm=False)
+    support_vllm=False,
+    hf_model_id='state-spaces/mamba-370m-hf')
 @register_model(
     ModelType.mamba_390m,
     'AI-ModelScope/mamba-390m-hf',
     LoRATM.mamba,
     TemplateType.default_generation,
     requires=['transformers>=4.39.0'],
-    support_vllm=False)
+    support_vllm=False,
+    hf_model_id='state-spaces/mamba-390m-hf')
 @register_model(
     ModelType.mamba_790m,
     'AI-ModelScope/mamba-790m-hf',
     LoRATM.mamba,
     TemplateType.default_generation,
     requires=['transformers>=4.39.0'],
-    support_vllm=False)
+    support_vllm=False,
+    hf_model_id='state-spaces/mamba-790m-hf')
 @register_model(
     ModelType.mamba_1_4b,
     'AI-ModelScope/mamba-1.4b-hf',
     LoRATM.mamba,
     TemplateType.default_generation,
     requires=['transformers>=4.39.0'],
-    support_vllm=False)
+    support_vllm=False,
+    hf_model_id='state-spaces/mamba-1.4b-hf')
 @register_model(
     ModelType.mamba_2_8b,
     'AI-ModelScope/mamba-2.8b-hf',
     LoRATM.mamba,
     TemplateType.default_generation,
     requires=['transformers>=4.39.0'],
-    support_vllm=False)
+    support_vllm=False,
+    hf_model_id='state-spaces/mamba-2.8b-hf')
 def get_model_tokenizer_mamba(model_dir: str,
                               torch_dtype: Optional[Dtype],
                               model_kwargs: Dict[str, Any],
@@ -622,21 +689,24 @@ def get_model_tokenizer_mamba(model_dir: str,
     LoRATM.cogvlm,
     TemplateType.cogvlm_instruct,
     support_gradient_checkpointing=False,
-    tags=['multi-modal', 'vision'])
+    tags=['multi-modal', 'vision'],
+    hf_model_id='THUDM/cogvlm-chat-hf')
 @register_model(
     ModelType.cogagent_18b_chat,
     'ZhipuAI/cogagent-chat',
     LoRATM.cogagent,
     TemplateType.cogagent_chat,
     support_gradient_checkpointing=False,
-    tags=['multi-modal', 'vision'])
+    tags=['multi-modal', 'vision'],
+    hf_model_id='THUDM/cogagent-chat-hf')
 @register_model(
     ModelType.cogagent_18b_instruct,
     'ZhipuAI/cogagent-vqa',
     LoRATM.cogagent,
     TemplateType.cogagent_instruct,
     support_gradient_checkpointing=False,
-    tags=['multi-modal', 'vision'])
+    tags=['multi-modal', 'vision'],
+    hf_model_id='THUDM/cogagent-vqa-hf')
 def get_model_tokenizer_cogagent(model_dir: str,
                                  torch_dtype: Dtype,
                                  model_kwargs: Dict[str, Any],
@@ -666,7 +736,8 @@ def get_model_tokenizer_cogagent(model_dir: str,
     'Shanghai_AI_Laboratory/internlm-chat-20b',
     LoRATM.llama2,
     TemplateType.internlm,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-chat-20b')
 @register_model(
     ModelType.internlm_7b_chat_8k,
     'Shanghai_AI_Laboratory/internlm-chat-7b-8k',
@@ -675,10 +746,11 @@ def get_model_tokenizer_cogagent(model_dir: str,
     support_vllm=True)
 @register_model(
     ModelType.internlm_7b_chat,
-    'Shanghai_AI_Laboratory/internlm-chat-7b-v1_1',
+    'Shanghai_AI_Laboratory/internlm-chat-7b',
     LoRATM.llama2,
     TemplateType.internlm,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm-chat-7b')
 def get_model_tokenizer_internlm_chat(model_dir: str,
                                       torch_dtype: Dtype,
                                       model_kwargs: Dict[str, Any],
@@ -699,7 +771,8 @@ def get_model_tokenizer_internlm_chat(model_dir: str,
     LoRATM.baichuan,
     TemplateType.default_generation,
     requires=['transformers<4.34'],
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='baichuan-inc/Baichuan-13B-Base')
 def get_model_tokenizer_baichuan_13b(model_dir: str,
                                      torch_dtype: Dtype,
                                      model_kwargs: Dict[str, Any],
@@ -722,13 +795,15 @@ def get_model_tokenizer_baichuan_13b(model_dir: str,
     'baichuan-inc/Baichuan2-13B-Chat',
     LoRATM.baichuan,
     TemplateType.baichuan,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='baichuan-inc/Baichuan2-13B-Chat')
 @register_model(
     ModelType.baichuan2_13b,
     'baichuan-inc/Baichuan2-13B-Base',
     LoRATM.baichuan,
     TemplateType.default_generation,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='baichuan-inc/Baichuan2-13B-Base')
 def get_model_tokenizer_baichuan2_13b(model_dir: str,
                                       torch_dtype: Dtype,
                                       model_kwargs: Dict[str, Any],
@@ -767,13 +842,15 @@ def patch_baichuan2_lm_head_forward(self, hidden_states: Tensor) -> Tensor:
     'baichuan-inc/Baichuan2-7B-Chat',
     LoRATM.baichuan,
     TemplateType.baichuan,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='baichuan-inc/Baichuan2-7B-Chat')
 @register_model(
     ModelType.baichuan2_7b,
     'baichuan-inc/Baichuan2-7B-Base',
     LoRATM.baichuan,
     TemplateType.default_generation,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='baichuan-inc/Baichuan2-7B-Base')
 def get_model_tokenizer_baichuan2(model_dir: str,
                                   torch_dtype: Dtype,
                                   model_kwargs: Dict[str, Any],
@@ -811,14 +888,16 @@ def get_model_tokenizer_baichuan2(model_dir: str,
         'get_baichuan2_function': get_model_tokenizer_baichuan2_13b
     },
     torch_dtype=torch.bfloat16,
-    requires=['bitsandbytes<0.41.2', 'accelerate<0.26'])
+    requires=['bitsandbytes<0.41.2', 'accelerate<0.26'],
+    hf_model_id='baichuan-inc/Baichuan2-13B-Chat-4bits')
 @register_model(
     ModelType.baichuan2_7b_chat_int4,
     'baichuan-inc/Baichuan2-7B-Chat-4bits',
     LoRATM.baichuan,
     TemplateType.baichuan,
     torch_dtype=torch.bfloat16,
-    requires=['bitsandbytes<0.41.2', 'accelerate<0.26'])
+    requires=['bitsandbytes<0.41.2', 'accelerate<0.26'],
+    hf_model_id='baichuan-inc/Baichuan2-7B-Chat-4bits')
 def get_model_tokenizer_baichuan2_int4(model_dir: str,
                                        torch_dtype: Dtype,
                                        model_kwargs: Dict[str, Any],
@@ -864,37 +943,43 @@ def remove_property(tokenizer_cls: Type[PreTrainedTokenizerBase],
     TemplateType.codefuse,
     requires=['transformers<4.34'],
     support_vllm=True,
-    tags=['coding'])
+    tags=['coding'],
+    hf_model_id='codefuse-ai/CodeFuse-CodeGeeX2-6B')
 @register_model(
     ModelType.chatglm3_6b_32k,
     'ZhipuAI/chatglm3-6b-32k',
     LoRATM.chatglm,
     TemplateType.chatglm3,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='THUDM/chatglm3-6b-32k')
 @register_model(
     ModelType.chatglm3_6b,
     'ZhipuAI/chatglm3-6b',
     LoRATM.chatglm,
     TemplateType.chatglm3,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='THUDM/chatglm3-6b')
 @register_model(
     ModelType.chatglm3_6b_base,
     'ZhipuAI/chatglm3-6b-base',
     LoRATM.chatglm,
     TemplateType.chatglm_generation,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='THUDM/chatglm3-6b-base')
 @register_model(
     ModelType.chatglm2_6b_32k,
     'ZhipuAI/chatglm2-6b-32k',
     LoRATM.chatglm,
     TemplateType.chatglm2,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='THUDM/chatglm2-6b-32k')
 @register_model(
     ModelType.chatglm2_6b,
     'ZhipuAI/chatglm2-6b',
     LoRATM.chatglm,
     TemplateType.chatglm2,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='THUDM/chatglm2-6b')
 @register_model(
     ModelType.codegeex2_6b,
     'ZhipuAI/codegeex2-6b',
@@ -902,7 +987,8 @@ def remove_property(tokenizer_cls: Type[PreTrainedTokenizerBase],
     TemplateType.chatglm_generation,
     requires=['transformers<4.34'],
     support_vllm=True,
-    tags=['coding'])
+    tags=['coding'],
+    hf_model_id='THUDM/codegeex2-6b')
 def get_model_tokenizer_chatglm(model_dir: str,
                                 torch_dtype: Dtype,
                                 model_kwargs: Dict[str, Any],
@@ -945,7 +1031,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     requires=['transformers>=4.38'],
     ignore_file_pattern=[r'.+\.gguf$'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='google/gemma-2b')
 @register_model(
     ModelType.gemma_7b,
     'AI-ModelScope/gemma-7b',
@@ -954,7 +1041,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     requires=['transformers>=4.38'],
     ignore_file_pattern=[r'.+\.gguf$'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='google/gemma-7b')
 @register_model(
     ModelType.gemma_2b_instruct,
     'AI-ModelScope/gemma-2b-it',
@@ -963,7 +1051,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     requires=['transformers>=4.38'],
     ignore_file_pattern=[r'.+\.gguf$'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='google/gemma-2b-it')
 @register_model(
     ModelType.gemma_7b_instruct,
     'AI-ModelScope/gemma-7b-it',
@@ -972,7 +1061,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     requires=['transformers>=4.38'],
     ignore_file_pattern=[r'.+\.gguf$'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='google/gemma-7b-it')
 @register_model(
     ModelType.deepseek_math_7b_instruct,
     'deepseek-ai/deepseek-math-7b-instruct',
@@ -980,7 +1070,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.deepseek,
     support_flash_attn=True,
     support_vllm=True,
-    tags=['math'])
+    tags=['math'],
+    hf_model_id='deepseek-ai/deepseek-math-7b-instruct')
 @register_model(
     ModelType.deepseek_math_7b_chat,
     'deepseek-ai/deepseek-math-7b-rl',
@@ -988,7 +1079,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.deepseek,
     support_flash_attn=True,
     support_vllm=True,
-    tags=['math'])
+    tags=['math'],
+    hf_model_id='deepseek-ai/deepseek-math-7b-rl')
 @register_model(
     ModelType.deepseek_math_7b,
     'deepseek-ai/deepseek-math-7b-base',
@@ -996,7 +1088,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
     support_vllm=True,
-    tags=['math'])
+    tags=['math'],
+    hf_model_id='deepseek-ai/deepseek-math-7b-base')
 @register_model(
     ModelType.qwen1half_0_5b,
     'qwen/Qwen1.5-0.5B',
@@ -1004,7 +1097,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-0.5B')
 @register_model(
     ModelType.qwen1half_1_8b,
     'qwen/Qwen1.5-1.8B',
@@ -1012,7 +1106,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-1.8B')
 @register_model(
     ModelType.qwen1half_4b,
     'qwen/Qwen1.5-4B',
@@ -1020,7 +1115,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-4B')
 @register_model(
     ModelType.qwen1half_7b,
     'qwen/Qwen1.5-7B',
@@ -1028,7 +1124,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-7B')
 @register_model(
     ModelType.qwen1half_14b,
     'qwen/Qwen1.5-14B',
@@ -1036,7 +1133,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-14B')
 @register_model(
     ModelType.qwen1half_32b,
     'qwen/Qwen1.5-32B',
@@ -1044,7 +1142,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-32B')
 @register_model(
     ModelType.qwen1half_72b,
     'qwen/Qwen1.5-72B',
@@ -1052,7 +1151,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-72B')
 @register_model(
     ModelType.qwen1half_moe_a2_7b,
     'qwen/Qwen1.5-MoE-A2.7B',
@@ -1060,7 +1160,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-MoE-A2.7B')
 @register_model(
     ModelType.deepseek_coder_1_3b,
     'deepseek-ai/deepseek-coder-1.3b-base',
@@ -1068,7 +1169,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
     support_vllm=True,
-    tags=['coding'])
+    tags=['coding'],
+    hf_model_id='deepseek-ai/deepseek-coder-1.3b-base')
 @register_model(
     ModelType.deepseek_coder_6_7b,
     'deepseek-ai/deepseek-coder-6.7b-base',
@@ -1076,7 +1178,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
     support_vllm=True,
-    tags=['coding'])
+    tags=['coding'],
+    hf_model_id='deepseek-ai/deepseek-coder-6.7b-base')
 @register_model(
     ModelType.deepseek_coder_33b,
     'deepseek-ai/deepseek-coder-33b-base',
@@ -1084,7 +1187,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
     support_vllm=True,
-    tags=['coding'])
+    tags=['coding'],
+    hf_model_id='deepseek-ai/deepseek-coder-33b-base')
 @register_model(
     ModelType.deepseek_coder_1_3b_instruct,
     'deepseek-ai/deepseek-coder-1.3b-instruct',
@@ -1093,7 +1197,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     eos_token='<|EOT|>',
     support_flash_attn=True,
     support_vllm=True,
-    tags=['coding'])
+    tags=['coding'],
+    hf_model_id='deepseek-ai/deepseek-coder-1.3b-instruct')
 @register_model(
     ModelType.deepseek_coder_6_7b_instruct,
     'deepseek-ai/deepseek-coder-6.7b-instruct',
@@ -1102,7 +1207,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     eos_token='<|EOT|>',
     support_flash_attn=True,
     support_vllm=True,
-    tags=['coding'])
+    tags=['coding'],
+    hf_model_id='deepseek-ai/deepseek-coder-6.7b-instruct')
 @register_model(
     ModelType.deepseek_coder_33b_instruct,
     'deepseek-ai/deepseek-coder-33b-instruct',
@@ -1111,49 +1217,56 @@ def cross_entropy_forward(self, inputs: Tensor,
     eos_token='<|EOT|>',
     support_flash_attn=True,
     support_vllm=True,
-    tags=['coding'])
+    tags=['coding'],
+    hf_model_id='deepseek-ai/deepseek-coder-33b-instruct')
 @register_model(
     ModelType.openbuddy_deepseek_67b_chat,
     'OpenBuddy/openbuddy-deepseek-67b-v15.2',
     LoRATM.llama2,
     TemplateType.openbuddy,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='OpenBuddy/openbuddy-deepseek-67b-v15.2')
 @register_model(
     ModelType.deepseek_67b_chat,
     'deepseek-ai/deepseek-llm-67b-chat',
     LoRATM.llama2,
     TemplateType.deepseek,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='deepseek-ai/deepseek-llm-67b-chat')
 @register_model(
     ModelType.deepseek_67b,
     'deepseek-ai/deepseek-llm-67b-base',
     LoRATM.llama2,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='deepseek-ai/deepseek-llm-67b-base')
 @register_model(
     ModelType.deepseek_7b_chat,
     'deepseek-ai/deepseek-llm-7b-chat',
     LoRATM.llama2,
     TemplateType.deepseek,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='deepseek-ai/deepseek-llm-7b-chat')
 @register_model(
     ModelType.deepseek_7b,
     'deepseek-ai/deepseek-llm-7b-base',
     LoRATM.llama2,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='deepseek-ai/deepseek-llm-7b-base')
 @register_model(
     ModelType.sus_34b_chat,
     'SUSTC/SUS-Chat-34B',
     LoRATM.llama2,
     TemplateType.sus,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='SUSTech/SUS-Chat-34B')
 @register_model(
     ModelType.openbuddy_zephyr_7b_chat,
     'OpenBuddy/openbuddy-zephyr-7b-v14.1',
@@ -1161,7 +1274,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.openbuddy,
     requires=['transformers>=4.34'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='OpenBuddy/openbuddy-zephyr-7b-v14.1')
 @register_model(
     ModelType.zephyr_7b_beta_chat,
     'modelscope/zephyr-7b-beta',
@@ -1169,7 +1283,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.zephyr,
     requires=['transformers>=4.34'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='HuggingFaceH4/zephyr-7b-beta')
 @register_model(
     ModelType.yi_6b_chat,
     '01ai/Yi-6B-Chat',
@@ -1177,7 +1292,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.yi,
     eos_token='<|im_end|>',
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='01-ai/Yi-6B-Chat')
 @register_model(
     ModelType.yi_34b_chat,
     '01ai/Yi-34B-Chat',
@@ -1185,56 +1301,72 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.yi,
     eos_token='<|im_end|>',
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='01-ai/Yi-34B-Chat')
 @register_model(
     ModelType.yi_34b_200k,
     '01ai/Yi-34B-200K',
     LoRATM.llama2,
     TemplateType.default_generation,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='01-ai/Yi-34B-200K')
 @register_model(
     ModelType.yi_34b,
     '01ai/Yi-34B',
     LoRATM.llama2,
     TemplateType.default_generation,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='01-ai/Yi-34B')
 @register_model(
     ModelType.yi_6b_200k,
     '01ai/Yi-6B-200K',
     LoRATM.llama2,
     TemplateType.default_generation,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='01-ai/Yi-6B-200K')
 @register_model(
     ModelType.yi_9b,
     '01ai/Yi-9B',
     LoRATM.llama2,
     TemplateType.default_generation,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='01-ai/Yi-9B')
+@register_model(
+    ModelType.yi_9b_200k,
+    '01ai/Yi-9B-200K',
+    LoRATM.llama2,
+    TemplateType.default_generation,
+    support_flash_attn=True,
+    support_vllm=True,
+    hf_model_id='01-ai/Yi-9B-200K')
 @register_model(
     ModelType.yi_6b,
     '01ai/Yi-6B',
     LoRATM.llama2,
     TemplateType.default_generation,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='01-ai/Yi-6B')
 @register_model(
     ModelType.ziya2_13b_chat,
     'Fengshenbang/Ziya2-13B-Chat',
     LoRATM.llama2,
     TemplateType.ziya,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='IDEA-CCNL/Ziya2-13B-Chat')
 @register_model(
     ModelType.ziya2_13b,
     'Fengshenbang/Ziya2-13B-Base',
     LoRATM.llama2,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='IDEA-CCNL/Ziya2-13B-Base')
 @register_model(
     ModelType.openbuddy_mixtral_moe_7b_chat,
     'OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k',
@@ -1243,7 +1375,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     requires=['transformers>=4.36'],
     support_flash_attn=True,
     support_vllm=True,
-    support_gradient_checkpointing=False)
+    support_gradient_checkpointing=False,
+    hf_model_id='OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k')
 @register_model(
     ModelType.openbuddy_mistral_7b_chat,
     'OpenBuddy/openbuddy-mistral-7b-v17.1-32k',
@@ -1251,28 +1384,32 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.openbuddy,
     requires=['transformers>=4.34'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='OpenBuddy/openbuddy-mistral-7b-v17.1-32k')
 @register_model(
     ModelType.openbuddy_llama2_70b_chat,
     'OpenBuddy/openbuddy-llama2-70b-v10.1-bf16',
     LoRATM.llama2,
     TemplateType.openbuddy,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='OpenBuddy/openbuddy-llama2-70b-v10.1-bf16')
 @register_model(
     ModelType.openbuddy_llama2_65b_chat,
     'OpenBuddy/openbuddy-llama-65b-v8-bf16',
     LoRATM.llama2,
     TemplateType.openbuddy,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='OpenBuddy/openbuddy-llama-65b-v8-bf16')
 @register_model(
     ModelType.openbuddy_llama2_13b_chat,
     'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16',
     LoRATM.llama2,
     TemplateType.openbuddy,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='OpenBuddy/openbuddy-llama2-13b-v8.1-fp16')
 @register_model(
     ModelType.mistral_7b_instruct,
     'AI-ModelScope/Mistral-7B-Instruct-v0.1',
@@ -1280,7 +1417,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.llama,
     requires=['transformers>=4.34'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='mistralai/Mistral-7B-Instruct-v0.1')
 @register_model(
     ModelType.mistral_7b_instruct_v2,
     'AI-ModelScope/Mistral-7B-Instruct-v0.2',
@@ -1288,7 +1426,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.llama,
     requires=['transformers>=4.34'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='mistralai/Mistral-7B-Instruct-v0.2')
 @register_model(
     ModelType.mistral_7b,
     'AI-ModelScope/Mistral-7B-v0.1',
@@ -1296,7 +1435,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation_bos,
     requires=['transformers>=4.34'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='mistralai/Mistral-7B-v0.1')
 @register_model(
     ModelType.mistral_7b_v2,
     'AI-ModelScope/Mistral-7B-v0.2-hf',
@@ -1304,25 +1444,30 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation_bos,
     requires=['transformers>=4.34'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='alpindale/Mistral-7B-v0.2-hf')
 @register_model(
     ModelType.mixtral_moe_7b,
     'AI-ModelScope/Mixtral-8x7B-v0.1',
     LoRATM.llama2,
     TemplateType.default_generation_bos,
     requires=['transformers>=4.36'],
+    ignore_file_pattern=[r'.+\.pt$'],
     support_flash_attn=True,
     support_vllm=True,
-    support_gradient_checkpointing=False)
+    support_gradient_checkpointing=False,
+    hf_model_id='mistralai/Mixtral-8x7B-v0.1')
 @register_model(
     ModelType.mixtral_moe_7b_instruct,
     'AI-ModelScope/Mixtral-8x7B-Instruct-v0.1',
     LoRATM.llama2,
     TemplateType.llama,
     requires=['transformers>=4.36'],
+    ignore_file_pattern=[r'.+\.pt$'],
     support_flash_attn=True,
     support_vllm=True,
-    support_gradient_checkpointing=False)
+    support_gradient_checkpointing=False,
+    hf_model_id='mistralai/Mixtral-8x7B-Instruct-v0.1')
 @register_model(
     ModelType.mixtral_moe_8x22b_v1,
     'AI-ModelScope/Mixtral-8x22B-v0.1',
@@ -1330,7 +1475,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation_bos,
     requires=['transformers>=4.36'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='mistral-community/Mixtral-8x22B-v0.1')
 @register_model(
     ModelType.dbrx_base,
     'AI-ModelScope/dbrx-base',
@@ -1339,7 +1485,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     requires=['transformers>=4.36'],
     support_flash_attn=True,
     support_vllm=True,
-    support_gradient_checkpointing=False)
+    support_gradient_checkpointing=False,
+    hf_model_id='databricks/dbrx-base')
 @register_model(
     ModelType.dbrx_instruct,
     'AI-ModelScope/dbrx-instruct',
@@ -1348,7 +1495,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     requires=['transformers>=4.36'],
     support_flash_attn=True,
     support_vllm=True,
-    support_gradient_checkpointing=False)
+    support_gradient_checkpointing=False,
+    hf_model_id='databricks/dbrx-instruct')
 def get_model_tokenizer_with_flash_attn(model_dir: str,
                                         torch_dtype: Dtype,
                                         model_kwargs: Dict[str, Any],
@@ -1381,7 +1529,8 @@ def get_model_tokenizer_with_flash_attn(model_dir: str,
     ignore_file_pattern=[r'.+\.bin$'],
     support_flash_attn=True,
     requires=['transformers>=4.38', 'aqlm', 'torch>=2.2.0'],
-    support_vllm=False)
+    support_vllm=False,
+    hf_model_id='ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf')
 @register_model(
     ModelType.mixtral_moe_7b_aqlm_2bit_1x16,
     'AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf',
@@ -1390,7 +1539,8 @@ def get_model_tokenizer_with_flash_attn(model_dir: str,
     requires=['transformers>=4.38', 'aqlm', 'torch>=2.2.0'],
     support_flash_attn=True,
     support_vllm=False,
-    support_gradient_checkpointing=False)
+    support_gradient_checkpointing=False,
+    hf_model_id='ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf')
 def get_model_tokenizer_aqlm(model_dir: str,
                              torch_dtype: Dtype,
                              model_kwargs: Dict[str, Any],
@@ -1415,7 +1565,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_flash_attn=True,
     support_vllm=True,
     function_kwargs={'is_awq': True},
-    requires=['transformers>=4.37', 'autoawq'])
+    requires=['transformers>=4.37', 'autoawq'],
+    hf_model_id='Qwen/Qwen1.5-0.5B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_1_8b_chat_awq,
     'qwen/Qwen1.5-1.8B-Chat-AWQ',
@@ -1424,7 +1575,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_flash_attn=True,
     support_vllm=True,
     function_kwargs={'is_awq': True},
-    requires=['transformers>=4.37', 'autoawq'])
+    requires=['transformers>=4.37', 'autoawq'],
+    hf_model_id='Qwen/Qwen1.5-1.8B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_4b_chat_awq,
     'qwen/Qwen1.5-4B-Chat-AWQ',
@@ -1433,7 +1585,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_flash_attn=True,
     support_vllm=True,
     function_kwargs={'is_awq': True},
-    requires=['transformers>=4.37', 'autoawq'])
+    requires=['transformers>=4.37', 'autoawq'],
+    hf_model_id='Qwen/Qwen1.5-4B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_7b_chat_awq,
     'qwen/Qwen1.5-7B-Chat-AWQ',
@@ -1442,7 +1595,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_flash_attn=True,
     support_vllm=True,
     function_kwargs={'is_awq': True},
-    requires=['transformers>=4.37', 'autoawq'])
+    requires=['transformers>=4.37', 'autoawq'],
+    hf_model_id='Qwen/Qwen1.5-7B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_14b_chat_awq,
     'qwen/Qwen1.5-14B-Chat-AWQ',
@@ -1451,7 +1605,18 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_flash_attn=True,
     support_vllm=True,
     function_kwargs={'is_awq': True},
-    requires=['transformers>=4.37', 'autoawq'])
+    requires=['transformers>=4.37', 'autoawq'],
+    hf_model_id='Qwen/Qwen1.5-14B-Chat-AWQ')
+@register_model(
+    ModelType.qwen1half_32b_chat_awq,
+    'qwen/Qwen1.5-32B-Chat-AWQ',
+    LoRATM.qwen1half,
+    TemplateType.qwen,
+    support_flash_attn=True,
+    support_vllm=True,
+    function_kwargs={'is_awq': True},
+    requires=['transformers>=4.37', 'autoawq'],
+    hf_model_id='Qwen/Qwen1.5-32B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_72b_chat_awq,
     'qwen/Qwen1.5-72B-Chat-AWQ',
@@ -1460,7 +1625,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_flash_attn=True,
     support_vllm=True,
     function_kwargs={'is_awq': True},
-    requires=['transformers>=4.37', 'autoawq'])
+    requires=['transformers>=4.37', 'autoawq'],
+    hf_model_id='Qwen/Qwen1.5-72B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_0_5b_chat,
     'qwen/Qwen1.5-0.5B-Chat',
@@ -1468,7 +1634,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     TemplateType.qwen,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-0.5B-Chat')
 @register_model(
     ModelType.qwen1half_1_8b_chat,
     'qwen/Qwen1.5-1.8B-Chat',
@@ -1476,7 +1643,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     TemplateType.qwen,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-1.8B-Chat')
 @register_model(
     ModelType.qwen1half_4b_chat,
     'qwen/Qwen1.5-4B-Chat',
@@ -1484,7 +1652,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     TemplateType.qwen,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-4B-Chat')
 @register_model(
     ModelType.qwen1half_7b_chat,
     'qwen/Qwen1.5-7B-Chat',
@@ -1492,7 +1661,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     TemplateType.qwen,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-7B-Chat')
 @register_model(
     ModelType.qwen1half_14b_chat,
     'qwen/Qwen1.5-14B-Chat',
@@ -1500,7 +1670,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     TemplateType.qwen,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-14B-Chat')
 @register_model(
     ModelType.qwen1half_32b_chat,
     'qwen/Qwen1.5-32B-Chat',
@@ -1508,7 +1679,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     TemplateType.qwen,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-32B-Chat')
 @register_model(
     ModelType.qwen1half_72b_chat,
     'qwen/Qwen1.5-72B-Chat',
@@ -1516,7 +1688,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     TemplateType.qwen,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-72B-Chat')
 @register_model(
     ModelType.qwen1half_moe_a2_7b_chat,
     'qwen/Qwen1.5-MoE-A2.7B-Chat',
@@ -1524,7 +1697,8 @@ def get_model_tokenizer_aqlm(model_dir: str,
     TemplateType.qwen,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/Qwen1.5-MoE-A2.7B-Chat')
 def get_model_tokenizer_qwen1half(model_dir: str,
                                   torch_dtype: Dtype,
                                   model_kwargs: Dict[str, Any],
@@ -1556,7 +1730,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={'bits': 4},
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4')
 @register_model(
     ModelType.qwen1half_0_5b_chat_int8,
     'qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8',
@@ -1565,7 +1740,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
     function_kwargs={'bits': 8},
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8')
 @register_model(
     ModelType.qwen1half_1_8b_chat_int4,
     'qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4',
@@ -1575,7 +1751,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={'bits': 4},
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4')
 @register_model(
     ModelType.qwen1half_1_8b_chat_int8,
     'qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8',
@@ -1584,7 +1761,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
     function_kwargs={'bits': 8},
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8')
 @register_model(
     ModelType.qwen1half_4b_chat_int4,
     'qwen/Qwen1.5-4B-Chat-GPTQ-Int4',
@@ -1594,7 +1772,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={'bits': 4},
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen1.5-4B-Chat-GPTQ-Int4')
 @register_model(
     ModelType.qwen1half_4b_chat_int8,
     'qwen/Qwen1.5-4B-Chat-GPTQ-Int8',
@@ -1603,7 +1782,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
     function_kwargs={'bits': 8},
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Qwen/Qwen1.5-4B-Chat-GPTQ-Int8')
 @register_model(
     ModelType.qwen1half_7b_chat_int4,
     'qwen/Qwen1.5-7B-Chat-GPTQ-Int4',
@@ -1613,7 +1793,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={'bits': 4},
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen1.5-7B-Chat-GPTQ-Int4')
 @register_model(
     ModelType.qwen1half_7b_chat_int8,
     'qwen/Qwen1.5-7B-Chat-GPTQ-Int8',
@@ -1622,7 +1803,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
     function_kwargs={'bits': 8},
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Qwen/Qwen1.5-7B-Chat-GPTQ-Int8')
 @register_model(
     ModelType.qwen1half_14b_chat_int4,
     'qwen/Qwen1.5-14B-Chat-GPTQ-Int4',
@@ -1632,7 +1814,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={'bits': 4},
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen1.5-14B-Chat-GPTQ-Int4')
 @register_model(
     ModelType.qwen1half_14b_chat_int8,
     'qwen/Qwen1.5-14B-Chat-GPTQ-Int8',
@@ -1641,7 +1824,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
     function_kwargs={'bits': 8},
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Qwen/Qwen1.5-14B-Chat-GPTQ-Int8')
 @register_model(
     ModelType.qwen1half_32b_chat_int4,
     'qwen/Qwen1.5-32B-Chat-GPTQ-Int4',
@@ -1651,7 +1835,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={'bits': 4},
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen1.5-32B-Chat-GPTQ-Int4')
 @register_model(
     ModelType.qwen1half_72b_chat_int4,
     'qwen/Qwen1.5-72B-Chat-GPTQ-Int4',
@@ -1661,7 +1846,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={'bits': 4},
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen1.5-72B-Chat-GPTQ-Int4')
 @register_model(
     ModelType.qwen1half_72b_chat_int8,
     'qwen/Qwen1.5-72B-Chat-GPTQ-Int8',
@@ -1670,7 +1856,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
     function_kwargs={'bits': 8},
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Qwen/Qwen1.5-72B-Chat-GPTQ-Int8')
 @register_model(
     ModelType.qwen1half_moe_a2_7b_chat_int4,
     'qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4',
@@ -1679,7 +1866,8 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
     function_kwargs={'bits': 4},
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4')
 def get_model_tokenizer_qwen1half_intx(model_dir: str,
                                        torch_dtype: Dtype,
                                        model_kwargs: Dict[str, Any],
@@ -1696,7 +1884,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str,
     LoRATM.internlm2,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-1_8b')
 @register_model(
     ModelType.internlm2_1_8b_sft_chat,
     'Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft',
@@ -1704,7 +1893,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str,
     TemplateType.internlm2,
     eos_token='<|im_end|>',
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-chat-1_8b-sft')
 @register_model(
     ModelType.internlm2_1_8b_chat,
     'Shanghai_AI_Laboratory/internlm2-chat-1_8b',
@@ -1712,7 +1902,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str,
     TemplateType.internlm2,
     eos_token='<|im_end|>',
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-chat-1_8b')
 @register_model(
     ModelType.internlm2_math_7b,
     'Shanghai_AI_Laboratory/internlm2-math-base-7b',
@@ -1720,7 +1911,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
     support_vllm=True,
-    tags=['math'])
+    tags=['math'],
+    hf_model_id='internlm/internlm2-math-base-7b')
 @register_model(
     ModelType.internlm2_math_20b,
     'Shanghai_AI_Laboratory/internlm2-math-base-20b',
@@ -1728,7 +1920,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
     support_vllm=True,
-    tags=['math'])
+    tags=['math'],
+    hf_model_id='internlm/internlm2-math-base-20b')
 @register_model(
     ModelType.internlm2_math_7b_chat,
     'Shanghai_AI_Laboratory/internlm2-math-7b',
@@ -1737,7 +1930,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str,
     eos_token='<|im_end|>',
     support_flash_attn=True,
     support_vllm=True,
-    tags=['math'])
+    tags=['math'],
+    hf_model_id='internlm/internlm2-math-7b')
 @register_model(
     ModelType.internlm2_math_20b_chat,
     'Shanghai_AI_Laboratory/internlm2-math-20b',
@@ -1746,7 +1940,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str,
     eos_token='<|im_end|>',
     support_flash_attn=True,
     support_vllm=True,
-    tags=['math'])
+    tags=['math'],
+    hf_model_id='internlm/internlm2-math-20b')
 @register_model(
     ModelType.internlm2_7b_sft_chat,
     'Shanghai_AI_Laboratory/internlm2-chat-7b-sft',
@@ -1754,7 +1949,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str,
     TemplateType.internlm2,
     eos_token='<|im_end|>',
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-chat-7b-sft')
 @register_model(
     ModelType.internlm2_7b_chat,
     'Shanghai_AI_Laboratory/internlm2-chat-7b',
@@ -1762,7 +1958,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str,
     TemplateType.internlm2,
     eos_token='<|im_end|>',
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-chat-7b')
 @register_model(
     ModelType.internlm2_20b_sft_chat,
     'Shanghai_AI_Laboratory/internlm2-chat-20b-sft',
@@ -1770,7 +1967,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str,
     TemplateType.internlm2,
     eos_token='<|im_end|>',
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-chat-20b-sft')
 @register_model(
     ModelType.internlm2_20b_chat,
     'Shanghai_AI_Laboratory/internlm2-chat-20b',
@@ -1778,35 +1976,40 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str,
     TemplateType.internlm2,
     eos_token='<|im_end|>',
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-chat-20b')
 @register_model(
     ModelType.internlm2_7b,
     'Shanghai_AI_Laboratory/internlm2-7b',
     LoRATM.internlm2,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-7b')
 @register_model(
     ModelType.internlm2_7b_base,
     'Shanghai_AI_Laboratory/internlm2-base-7b',
     LoRATM.internlm2,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-base-7b')
 @register_model(
     ModelType.internlm2_20b,
     'Shanghai_AI_Laboratory/internlm2-20b',
     LoRATM.internlm2,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-20b')
 @register_model(
     ModelType.internlm2_20b_base,
     'Shanghai_AI_Laboratory/internlm2-base-20b',
     LoRATM.internlm2,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='internlm/internlm2-base-20b')
 def get_model_tokenizer_internlm2(model_dir: str,
                                   torch_dtype: Dtype,
                                   model_kwargs: Dict[str, Any],
@@ -1841,7 +2044,8 @@ def get_model_tokenizer_internlm2(model_dir: str,
     TemplateType.internlm_xcomposer2,
     eos_token='[UNUSED_TOKEN_145]',
     support_flash_attn=True,
-    tags=['multi-modal', 'vision'])
+    tags=['multi-modal', 'vision'],
+    hf_model_id='internlm/internlm-xcomposer2-7b')
 def get_model_tokenizer_internlm_xcomposer2(model_dir: str,
                                             torch_dtype: Dtype,
                                             model_kwargs: Dict[str, Any],
@@ -1951,14 +2155,16 @@ def _patch_deepseek_vl(model) -> None:
     LoRATM.llama2,
     TemplateType.deepseek_vl,
     support_flash_attn=True,
-    tags=['multi-modal', 'vision'])
+    tags=['multi-modal', 'vision'],
+    hf_model_id='deepseek-ai/deepseek-vl-7b-chat')
 @register_model(
     ModelType.deepseek_vl_1_3b_chat,
     'deepseek-ai/deepseek-vl-1.3b-chat',
     LoRATM.llama2,
     TemplateType.deepseek_vl,
     support_flash_attn=True,
-    tags=['multi-modal', 'vision'])
+    tags=['multi-modal', 'vision'],
+    hf_model_id='deepseek-ai/deepseek-vl-1.3b-chat')
 def get_model_tokenizer_deepseek_vl(model_dir: str,
                                     torch_dtype: Dtype,
                                     model_kwargs: Dict[str, Any],
@@ -2007,7 +2213,8 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
     TemplateType.default_generation_bos,
     ignore_file_pattern=[r'.+\.bin$'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='meta-llama/Llama-2-7b-hf')
 @register_model(
     ModelType.llama2_13b,
     'modelscope/Llama-2-13b-ms',
@@ -2015,7 +2222,8 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
     TemplateType.default_generation_bos,
     ignore_file_pattern=[r'.+\.bin$'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='meta-llama/Llama-2-13b-hf')
 @register_model(
     ModelType.llama2_70b,
     'modelscope/Llama-2-70b-ms',
@@ -2023,7 +2231,8 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
     TemplateType.default_generation_bos,
     ignore_file_pattern=[r'.+\.bin$'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='meta-llama/Llama-2-70b-hf')
 @register_model(
     ModelType.llama2_7b_chat,
     'modelscope/Llama-2-7b-chat-ms',
@@ -2031,7 +2240,8 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
     TemplateType.llama,
     ignore_file_pattern=[r'.+\.bin$'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='meta-llama/Llama-2-7b-chat-hf')
 @register_model(
     ModelType.llama2_13b_chat,
     'modelscope/Llama-2-13b-chat-ms',
@@ -2039,7 +2249,8 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
     TemplateType.llama,
     ignore_file_pattern=[r'.+\.bin$'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='meta-llama/Llama-2-13b-chat-hf')
 @register_model(
     ModelType.llama2_70b_chat,
     'modelscope/Llama-2-70b-chat-ms',
@@ -2047,7 +2258,8 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
     TemplateType.llama,
     ignore_file_pattern=[r'.+\.bin$'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='meta-llama/Llama-2-70b-chat-hf')
 def get_model_tokenizer_llama2(model_dir: str,
                                torch_dtype: Dtype,
                                model_kwargs: Dict[str, Any],
@@ -2065,8 +2277,12 @@ def get_model_tokenizer_llama2(model_dir: str,
         **kwargs)
 
 
-@register_model(ModelType.polylm_13b, 'damo/nlp_polylm_13b_text_generation',
-                LoRATM.polylm, TemplateType.default_generation)
+@register_model(
+    ModelType.polylm_13b,
+    'damo/nlp_polylm_13b_text_generation',
+    LoRATM.polylm,
+    TemplateType.default_generation,
+    hf_model_id='DAMO-NLP-MT/polylm-13b')
 def get_model_tokenizer_polylm(model_dir: str,
                                torch_dtype: Dtype,
                                model_kwargs: Dict[str, Any],
@@ -2139,21 +2355,24 @@ def get_model_tokenizer_qwen(model_dir: str,
     TemplateType.codefuse,
     support_flash_attn=True,
     support_vllm=True,
-    tags=['coding'])
+    tags=['coding'],
+    hf_model_id='codefuse-ai/CodeFuse-QWen-14B')
 @register_model(
     ModelType.qwen_1_8b,
     'qwen/Qwen-1_8B',
     LoRATM.qwen,
     TemplateType.default_generation,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen1.5-1.8B')
 @register_model(
     ModelType.qwen_72b,
     'qwen/Qwen-72B',
     LoRATM.qwen,
     TemplateType.default_generation,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen-72B')
 @register_model(
     ModelType.tongyi_finance_14b,
     'TongyiFinance/Tongyi-Finance-14B',
@@ -2168,14 +2387,16 @@ def get_model_tokenizer_qwen(model_dir: str,
     LoRATM.qwen,
     TemplateType.default_generation,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen-14B')
 @register_model(
     ModelType.qwen_7b,
     'qwen/Qwen-7B',
     LoRATM.qwen,
     TemplateType.default_generation,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen-7B')
 def get_model_tokenizer_qwen_base(*args, **kwargs):
     model, tokenizer = get_model_tokenizer_qwen(*args, **kwargs)
     tokenizer.eos_token_id = tokenizer.eod_id
@@ -2188,14 +2409,16 @@ def get_model_tokenizer_qwen_base(*args, **kwargs):
     LoRATM.qwen,
     TemplateType.qwen,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen-1_8B-Chat')
 @register_model(
     ModelType.qwen_72b_chat,
     'qwen/Qwen-72B-Chat',
     LoRATM.qwen,
     TemplateType.qwen,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen-72B-Chat')
 @register_model(
     ModelType.tongyi_finance_14b_chat,
     'TongyiFinance/Tongyi-Finance-14B-Chat',
@@ -2203,21 +2426,24 @@ def get_model_tokenizer_qwen_base(*args, **kwargs):
     TemplateType.qwen,
     support_flash_attn=True,
     support_vllm=True,
-    tags=['financial'])
+    tags=['financial'],
+    hf_model_id='jxy/Tongyi-Finance-14B-Chat')
 @register_model(
     ModelType.qwen_14b_chat,
     'qwen/Qwen-14B-Chat',
     LoRATM.qwen,
     TemplateType.qwen,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen-14B-Chat')
 @register_model(
     ModelType.qwen_7b_chat,
     'qwen/Qwen-7B-Chat',
     LoRATM.qwen,
     TemplateType.qwen,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen-7B-Chat')
 def get_model_tokenizer_qwen_chat(*args, **kwargs):
     model, tokenizer = get_model_tokenizer_qwen(*args, **kwargs)
     tokenizer.eos_token_id = tokenizer.im_end_id
@@ -2279,7 +2505,8 @@ def _qwen_vl_audio_decode(self,
     LoRATM.qwen,
     TemplateType.qwen,
     support_flash_attn=True,
-    tags=['multi-modal', 'vision'])
+    tags=['multi-modal', 'vision'],
+    hf_model_id='Qwen/Qwen-VL-Chat')
 @register_model(
     ModelType.qwen_vl,
     'qwen/Qwen-VL',
@@ -2287,7 +2514,8 @@ def _qwen_vl_audio_decode(self,
     TemplateType.default_generation,
     function_kwargs={'get_qwen_function': get_model_tokenizer_qwen_base},
     support_flash_attn=True,
-    tags=['multi-modal', 'vision'])
+    tags=['multi-modal', 'vision'],
+    hf_model_id='Qwen/Qwen-VL')
 def get_model_tokenizer_qwen_vl(model_dir: str,
                                 torch_dtype: Dtype,
                                 model_kwargs: Dict[str, Any],
@@ -2341,7 +2569,8 @@ def get_model_tokenizer_qwen_vl(model_dir: str,
     TemplateType.qwen_audio,
     support_flash_attn=True,
     function_kwargs={'get_qwen_function': get_model_tokenizer_qwen_chat},
-    tags=['multi-modal', 'audio'])
+    tags=['multi-modal', 'audio'],
+    hf_model_id='Qwen/Qwen-Audio-Chat')
 @register_model(
     ModelType.qwen_audio,
     'qwen/Qwen-Audio',
@@ -2349,7 +2578,8 @@ def get_model_tokenizer_qwen_vl(model_dir: str,
     TemplateType.qwen_audio_generation,
     support_flash_attn=True,
     function_kwargs={'get_qwen_function': get_model_tokenizer_qwen_base},
-    tags=['multi-modal', 'audio'])
+    tags=['multi-modal', 'audio'],
+    hf_model_id='Qwen/Qwen-Audio')
 def get_model_tokenizer_qwen_audio(model_dir: str,
                                    torch_dtype: Dtype,
                                    model_kwargs: Dict[str, Any],
@@ -2382,7 +2612,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
     function_kwargs={'bits': 8},
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Qwen/Qwen-1_8B-Chat-Int8')
 @register_model(
     ModelType.qwen_1_8b_chat_int4,
     'qwen/Qwen-1_8B-Chat-Int4',
@@ -2392,7 +2623,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={'bits': 4},
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen-1_8B-Chat-Int4')
 @register_model(
     ModelType.qwen_72b_chat_int8,
     'qwen/Qwen-72B-Chat-Int8',
@@ -2401,7 +2633,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
     function_kwargs={'bits': 8},
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Qwen/Qwen-72B-Chat-Int8')
 @register_model(
     ModelType.qwen_72b_chat_int4,
     'qwen/Qwen-72B-Chat-Int4',
@@ -2411,7 +2644,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={'bits': 4},
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen-72B-Chat-Int4')
 @register_model(
     ModelType.tongyi_finance_14b_chat_int4,
     'TongyiFinance/Tongyi-Finance-14B-Chat-Int4',
@@ -2422,7 +2656,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     function_kwargs={'bits': 4},
     support_flash_attn=True,
     support_vllm=True,
-    tags=['financial'])
+    tags=['financial'],
+    hf_model_id='jxy/Tongyi-Finance-14B-Chat-Int4')
 @register_model(
     ModelType.qwen_vl_chat_int4,
     'qwen/Qwen-VL-Chat-Int4',
@@ -2435,7 +2670,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
         'bits': 4
     },
     support_flash_attn=True,
-    tags=['multi-modal', 'vision'])
+    tags=['multi-modal', 'vision'],
+    hf_model_id='Qwen/Qwen-VL-Chat-Int4')
 @register_model(
     ModelType.qwen_14b_chat_int8,
     'qwen/Qwen-14B-Chat-Int8',
@@ -2444,7 +2680,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
     function_kwargs={'bits': 8},
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Qwen/Qwen-14B-Chat-Int8')
 @register_model(
     ModelType.qwen_7b_chat_int8,
     'qwen/Qwen-7B-Chat-Int8',
@@ -2453,7 +2690,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
     function_kwargs={'bits': 8},
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Qwen/Qwen-7B-Chat-Int8')
 @register_model(
     ModelType.qwen_14b_chat_int4,
     'qwen/Qwen-14B-Chat-Int4',
@@ -2463,7 +2701,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={'bits': 4},
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen-14B-Chat-Int4')
 @register_model(
     ModelType.qwen_7b_chat_int4,
     'qwen/Qwen-7B-Chat-Int4',
@@ -2473,7 +2712,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={'bits': 4},
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='Qwen/Qwen-7B-Chat-Int4')
 def get_model_tokenizer_qwen_intx(model_dir: str,
                                   torch_dtype: Dtype,
                                   model_kwargs: Dict[str, Any],
@@ -2511,9 +2751,13 @@ def _new_forward(self, x):
     return model, tokenizer
 
 
-register_model(ModelType.skywork_13b, 'skywork/Skywork-13B-base',
-               LoRATM.llama2, TemplateType.default_generation_bos,
-               get_model_tokenizer_from_repo)
+register_model(
+    ModelType.skywork_13b,
+    'skywork/Skywork-13B-base',
+    LoRATM.llama2,
+    TemplateType.default_generation_bos,
+    get_model_tokenizer_from_repo,
+    hf_model_id='Skywork/Skywork-13B-base')
 
 
 @register_model(ModelType.skywork_13b_chat, 'skywork/Skywork-13B-chat',
@@ -2539,7 +2783,8 @@ def get_skywork_model_tokenizer(model_dir: str,
     TemplateType.codefuse_codellama,
     support_flash_attn=True,
     support_vllm=True,
-    tags=['coding'])
+    tags=['coding'],
+    hf_model_id='codefuse-ai/CodeFuse-CodeLlama-34B')
 def get_model_tokenizer_codellama(model_dir: str,
                                   torch_dtype: Dtype,
                                   model_kwargs: Dict[str, Any],
@@ -2564,13 +2809,15 @@ def get_model_tokenizer_codellama(model_dir: str,
     support_flash_attn=True,
     support_vllm=True,
     support_gradient_checkpointing=False,
-    tags=['coding'])
+    tags=['coding'],
+    hf_model_id='microsoft/phi-2')
 @register_model(
     ModelType.telechat_12b,
     'TeleAI/TeleChat-12B',
     LoRATM.telechat,
     TemplateType.telechat,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Tele-AI/TeleChat-12B')
 def get_model_tokenizer_phi(model_dir: str,
                             torch_dtype: Dtype,
                             model_kwargs: Dict[str, Any],
@@ -2594,7 +2841,8 @@ def get_model_tokenizer_phi(model_dir: str,
     'TeleAI/TeleChat-7B',
     LoRATM.telechat,
     TemplateType.telechat,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Tele-AI/telechat-7B')
 def get_model_tokenizer_telechat(model_dir: str,
                                  torch_dtype: Dtype,
                                  model_kwargs: Dict[str, Any],
@@ -2624,14 +2872,16 @@ def get_model_tokenizer_telechat(model_dir: str,
     LoRATM.llama2,
     TemplateType.deepseek,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='deepseek-ai/deepseek-moe-16b-chat')
 @register_model(
     ModelType.deepseek_moe_16b,
     'deepseek-ai/deepseek-moe-16b-base',
     LoRATM.llama2,
     TemplateType.default_generation_bos,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='deepseek-ai/deepseek-moe-16b-base')
 @register_model(
     ModelType.minicpm_moe_8x2b,
     'OpenBMB/MiniCPM-MoE-8x2B',
@@ -2639,7 +2889,8 @@ def get_model_tokenizer_telechat(model_dir: str,
     TemplateType.minicpm,
     requires=['transformers>=4.36.0'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='openbmb/MiniCPM-MoE-8x2B')
 def get_model_tokenizer_deepseek_moe(model_dir: str,
                                      torch_dtype: Dtype,
                                      model_kwargs: Dict[str, Any],
@@ -2677,25 +2928,29 @@ def _new_forward(hidden_states, *,
     'YuanLLM/Yuan2.0-2B-hf',
     LoRATM.llama2,
     TemplateType.yuan,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='IEITYuan/Yuan2-2B-hf')
 @register_model(
     ModelType.yuan2_51b_instruct,
     'YuanLLM/Yuan2.0-51B-hf',
     LoRATM.llama2,
     TemplateType.yuan,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='IEITYuan/Yuan2-51B-hf')
 @register_model(
     ModelType.yuan2_102b_instruct,
     'YuanLLM/Yuan2.0-102B-hf',
     LoRATM.llama2,
     TemplateType.yuan,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='IEITYuan/Yuan2-102B-hf')
 @register_model(
     ModelType.yuan2_2b_janus_instruct,
     'YuanLLM/Yuan2-2B-Janus-hf',
     LoRATM.llama2,
     TemplateType.yuan,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='IEITYuan/Yuan2-2B-Janus-hf')
 def get_model_tokenizer_yuan(model_dir: str,
                              torch_dtype: Dtype,
                              model_kwargs: Dict[str, Any],
@@ -2743,13 +2998,15 @@ def get_model_tokenizer_yuan(model_dir: str,
     'OrionStarAI/Orion-14B-Base',
     LoRATM.llama2,
     TemplateType.default_generation,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='OrionStarAI/Orion-14B-Base')
 @register_model(
     ModelType.orion_14b_chat,
     'OrionStarAI/Orion-14B-Chat',
     LoRATM.llama2,
     TemplateType.orion,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='OrionStarAI/Orion-14B-Chat')
 def get_model_tokenizer_orion(model_dir: str,
                               torch_dtype: Dtype,
                               model_kwargs: Dict[str, Any],
@@ -2774,7 +3031,8 @@ def get_model_tokenizer_orion(model_dir: str,
     TemplateType.yi_vl,
     support_flash_attn=True,
     requires=['transformers>=4.34'],
-    tags=['multi-modal', 'vision'])
+    tags=['multi-modal', 'vision'],
+    hf_model_id='01-ai/Yi-VL-34B')
 @register_model(
     ModelType.yi_vl_6b_chat,
     '01ai/Yi-VL-6B',
@@ -2782,7 +3040,8 @@ def get_model_tokenizer_orion(model_dir: str,
     TemplateType.yi_vl,
     support_flash_attn=True,
     requires=['transformers>=4.34'],
-    tags=['multi-modal', 'vision'])
+    tags=['multi-modal', 'vision'],
+    hf_model_id='01-ai/Yi-VL-6B')
 def get_model_tokenizer_yi_vl(model_dir: str,
                               torch_dtype: Dtype,
                               model_kwargs: Dict[str, Any],
@@ -2825,14 +3084,16 @@ def get_model_tokenizer_yi_vl(model_dir: str,
     LoRATM.llama2,
     TemplateType.minicpm,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='openbmb/MiniCPM-2B-sft-fp32')
 @register_model(
     ModelType.minicpm_2b_chat,
     'OpenBMB/MiniCPM-2B-dpo-fp32',
     LoRATM.llama2,
     TemplateType.minicpm,
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='openbmb/MiniCPM-2B-dpo-fp32')
 @register_model(
     ModelType.minicpm_1b_sft_chat,
     'OpenBMB/MiniCPM-1B-sft-bf16',
@@ -2840,7 +3101,8 @@ def get_model_tokenizer_yi_vl(model_dir: str,
     TemplateType.minicpm,
     requires=['transformers>=4.36.0'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='openbmb/MiniCPM-1B-sft-bf16')
 @register_model(
     ModelType.minicpm_2b_128k,
     'OpenBMB/MiniCPM-2B-128k',
@@ -2848,7 +3110,8 @@ def get_model_tokenizer_yi_vl(model_dir: str,
     TemplateType.chatml,
     requires=['transformers>=4.36.0'],
     support_flash_attn=True,
-    support_vllm=True)
+    support_vllm=True,
+    hf_model_id='openbmb/MiniCPM-2B-128k')
 def get_model_tokenizer_minicpm(model_dir: str,
                                 torch_dtype: Dtype,
                                 model_kwargs: Dict[str, Any],
@@ -2873,13 +3136,15 @@ def get_model_tokenizer_minicpm(model_dir: str,
     'OpenBMB/MiniCPM-V',
     LoRATM.llama2,
     TemplateType.minicpm_v,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='openbmb/MiniCPM-V')
 @register_model(
     ModelType.minicpm_v_v2,
     'OpenBMB/MiniCPM-V-2',
     LoRATM.llama2,
     TemplateType.minicpm_v,
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='openbmb/MiniCPM-V-2')
 def get_model_tokenizer_minicpm_v(model_dir: str,
                                   torch_dtype: Dtype,
                                   model_kwargs: Dict[str, Any],
@@ -2911,6 +3176,16 @@ def _new_generate(inputs=None, *args, **kwargs):
     model.generate = _new_generate
 
 
+@register_model(
+    ModelType.llava1d6_yi_34b_instruct,
+    'AI-ModelScope/llava-v1.6-34b',
+    LoRATM.llama2,
+    TemplateType.llava_yi_instruct,
+    eos_token='<|im_end|>',
+    support_flash_attn=True,
+    function_kwargs={'llm_model_type': 'llama'},
+    tags=['multi-modal', 'vision'],
+    hf_model_id='liuhaotian/llava-v1.6-34b')
 @register_model(
     ModelType.llava1d6_mistral_7b_instruct,
     'AI-ModelScope/llava-v1.6-mistral-7b',
@@ -2918,7 +3193,9 @@ def _new_generate(inputs=None, *args, **kwargs):
     TemplateType.llava_mistral_instruct,
     requires=['transformers>=4.34'],
     support_flash_attn=True,
-    tags=['multi-modal', 'vision'])
+    function_kwargs={'llm_model_type': 'mistral'},
+    tags=['multi-modal', 'vision'],
+    hf_model_id='liuhaotian/llava-v1.6-mistral-7b')
 def get_model_tokenizer_llava(model_dir: str,
                               torch_dtype: Dtype,
                               model_kwargs: Dict[str, Any],
@@ -2928,8 +3205,26 @@ def get_model_tokenizer_llava(model_dir: str,
         'https://github.com/haotian-liu/LLaVA.git')
     sys.path.append(os.path.join(local_repo_path))
 
-    from llava.model import LlavaMistralForCausalLM, LlavaMistralConfig
-    model_config = LlavaMistralConfig.from_pretrained(model_dir)
+    llm_model_type = kwargs.pop('llm_model_type')
+    if llm_model_type == 'mistral':
+        from llava.model import LlavaMistralForCausalLM, LlavaMistralConfig
+        model_config = LlavaMistralConfig.from_pretrained(model_dir)
+        automodel_class = LlavaMistralForCausalLM
+    else:  # llama
+        from llava.model import LlavaLlamaForCausalLM, LlavaConfig
+        if not hasattr(LlavaLlamaForCausalLM,
+                       '__old_forward'):  # Avoid double patching
+            forward = LlavaLlamaForCausalLM.forward
+            LlavaLlamaForCausalLM.__old_forward = forward
+
+            @wraps(forward)
+            def _new_forward(*args, **kwargs):
+                kwargs.pop('cache_position', None)
+                return forward(*args, **kwargs)
+
+            LlavaLlamaForCausalLM.forward = _new_forward
+        model_config = LlavaConfig.from_pretrained(model_dir)
+        automodel_class = LlavaLlamaForCausalLM
     model_config.mm_vision_tower = snapshot_download(
         'AI-ModelScope/clip-vit-large-patch14-336')
     model, tokenizer = get_model_tokenizer_with_flash_attn(
@@ -2938,7 +3233,7 @@ def get_model_tokenizer_llava(model_dir: str,
         model_kwargs,
         load_model,
         model_config=model_config,
-        automodel_class=LlavaMistralForCausalLM,
+        automodel_class=automodel_class,
         **kwargs)
 
     model.resize_token_embeddings(len(tokenizer))
@@ -2952,55 +3247,6 @@ def get_model_tokenizer_llava(model_dir: str,
     return model, tokenizer
 
 
-@register_model(
-    ModelType.llava1d6_yi_34b_instruct,
-    'AI-ModelScope/llava-v1.6-34b',
-    LoRATM.llama2,
-    TemplateType.llava_yi_instruct,
-    eos_token='<|im_end|>',
-    support_flash_attn=True,
-    tags=['multi-modal', 'vision'])
-def get_model_tokenizer_llava_34b(model_dir: str,
-                                  torch_dtype: Dtype,
-                                  model_kwargs: Dict[str, Any],
-                                  load_model: bool = True,
-                                  **kwargs):
-    local_repo_path = _git_clone_github(
-        'https://github.com/haotian-liu/LLaVA.git')
-    sys.path.append(os.path.join(local_repo_path))
-
-    from llava.model import LlavaLlamaForCausalLM, LlavaConfig
-    forward = LlavaLlamaForCausalLM.forward
-    LlavaLlamaForCausalLM.__old_forward = forward
-
-    @wraps(forward)
-    def _new_forward(*args, **kwargs):
-        kwargs.pop('cache_position', None)
-        return forward(*args, **kwargs)
-
-    LlavaLlamaForCausalLM.forward = _new_forward
-    model_config = LlavaConfig.from_pretrained(model_dir)
-    model_config.mm_vision_tower = snapshot_download(
-        'AI-ModelScope/clip-vit-large-patch14-336')
-    model, tokenizer = get_model_tokenizer_with_flash_attn(
-        model_dir,
-        torch_dtype,
-        model_kwargs,
-        load_model,
-        model_config=model_config,
-        automodel_class=LlavaLlamaForCausalLM,
-        **kwargs)
-    model.resize_token_embeddings(len(tokenizer))
-    vision_tower = model.get_vision_tower()
-    device_map = str(model_kwargs.get('device_map', str(model.device)))
-    if not vision_tower.is_loaded:
-        vision_tower.load_model(device_map=device_map)
-    if not hasattr(model.config, 'max_sequence_length'):
-        model.config.max_sequence_length = 2048
-    _patch_llava(model)
-    return model, tokenizer
-
-
 @register_model(
     ModelType.mplug_owl2_chat,
     'iic/mPLUG-Owl2',
@@ -3011,7 +3257,8 @@ def _new_forward(*args, **kwargs):
     function_kwargs={
         'get_model_tokenizer_function': get_model_tokenizer_with_flash_attn
     },
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='MAGAer13/mplug-owl2-llama2-7b')
 @register_model(
     ModelType.mplug_owl2d1_chat,
     'iic/mPLUG-Owl2.1',
@@ -3023,7 +3270,8 @@ def _new_forward(*args, **kwargs):
         'vocab_size': 151851,
         'get_model_tokenizer_function': get_model_tokenizer_qwen
     },
-    support_flash_attn=True)
+    support_flash_attn=True,
+    hf_model_id='Mizukiluke/mplug_owl_2_1')
 def get_model_tokenizer_mplug_owl2(model_dir: str,
                                    torch_dtype: Dtype,
                                    model_kwargs: Dict[str, Any],
@@ -3109,7 +3357,7 @@ def safe_snapshot_download(model_type: str,
         dist.barrier()
     if model_id_or_path is not None and not os.path.exists(model_id_or_path):
         revision = model_info['revision']
-        use_hf = model_info['use_hf']
+        use_hf = strtobool(os.environ.get('USE_HF', 'False'))
         ignore_file_pattern = model_info['ignore_file_pattern']
         if use_hf:
             logger.info(

From dcf38555d98a0f7df3fd78fa0f311055349a4ab6 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 16 Apr 2024 17:20:55 +0800
Subject: [PATCH 03/26] fix qwen-vl bnb bug

---
 swift/llm/utils/model.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 2cf09e95c9..6d4c86842b 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -2527,6 +2527,15 @@ def get_model_tokenizer_qwen_vl(model_dir: str,
         model_kwargs['quantization_config'].llm_int8_skip_modules = [
             'lm_head', 'attn_pool.attn'
         ]
+        _TransformerBlock = get_class_from_dynamic_module(
+            'visual.TransformerBlock', model_dir)
+
+        def _get_cast_dtype(self) -> torch.dtype:
+            return self.resblocks[0].ln_1.weight.dtype
+
+        _TransformerBlock.__old_get_cast_dtype = _TransformerBlock.get_cast_dtype
+        _TransformerBlock.get_cast_dtype = _get_cast_dtype
+
     get_qwen_function = kwargs.pop('get_qwen_function',
                                    get_model_tokenizer_qwen_chat)
     tokenizer_config = get_tokenizer_config(model_dir)

From 693082f2aba98ce558cc423db98e9e1dadefd0e7 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 16 Apr 2024 17:51:21 +0800
Subject: [PATCH 04/26] fix gptq quant bug

---
 swift/llm/export.py |  2 +-
 swift/llm/infer.py  | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/swift/llm/export.py b/swift/llm/export.py
index 09a701cdd3..0fe892b8e7 100644
--- a/swift/llm/export.py
+++ b/swift/llm/export.py
@@ -145,7 +145,7 @@ def gptq_model_quantize(model, tokenizer):
     global _args
     logger.info(f'Quantization dataset: {_args.dataset}')
     gptq_quantizer = GPTQQuantizer(
-        bits=_args.quant_bits, dataset=_args.dataset)
+        bits=_args.quant_bits, dataset=','.join(_args.dataset))
     _origin_get_dataset = quantizer.get_dataset
     quantizer.get_dataset = _get_dataset
     logger.info('Start quantizing the model...')
diff --git a/swift/llm/infer.py b/swift/llm/infer.py
index 88d178cc3d..b6c86e91f7 100644
--- a/swift/llm/infer.py
+++ b/swift/llm/infer.py
@@ -107,10 +107,13 @@ def merge_lora(args: InferArguments,
     model_kwargs = {}
     if is_torch_npu_available():
         logger.info(f'device_count: {torch.npu.device_count()}')
-        device_map = 'npu:0'
+        if device_map is None:
+            device_map = 'npu:0'
     else:
         logger.info(f'device_count: {torch.cuda.device_count()}')
-        device_map = 'auto'
+        if device_map is None:
+            device_map = 'auto'
+    if device_map == 'auto':
         model_kwargs['low_cpu_mem_usage'] = True
     model_kwargs['device_map'] = device_map
 
@@ -150,10 +153,13 @@ def prepare_model_template(
     model_kwargs = {}
     if is_torch_npu_available():
         logger.info(f'device_count: {torch.npu.device_count()}')
-        device_map = 'npu:0'
+        if device_map is None:
+            device_map = 'npu:0'
     else:
         logger.info(f'device_count: {torch.cuda.device_count()}')
-        device_map = 'auto'
+        if device_map is None:
+            device_map = 'auto'
+    if device_map == 'auto':
         model_kwargs['low_cpu_mem_usage'] = True
     model_kwargs['device_map'] = device_map
 

From 96772ad76187ea8a4fe4710e6064b50bb9227bda Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 16 Apr 2024 18:36:00 +0800
Subject: [PATCH 05/26]  fix gptq quant bug

---
 docs/source/Multi-Modal/index.md |  12 +--
 swift/llm/dpo.py                 |   3 +-
 swift/llm/export.py              |   2 +-
 swift/llm/infer.py               |  14 +++-
 swift/llm/utils/model.py         | 123 +++++++++++++++----------------
 5 files changed, 79 insertions(+), 75 deletions(-)

diff --git a/docs/source/Multi-Modal/index.md b/docs/source/Multi-Modal/index.md
index b25b49e3e1..50475ace57 100644
--- a/docs/source/Multi-Modal/index.md
+++ b/docs/source/Multi-Modal/index.md
@@ -5,9 +5,9 @@
 1. [Qwen-VL最佳实践](qwen-vl最佳实践.md)
 2. [Qwen-Audio最佳实践](qwen-audio最佳实践.md)
 3. [Llava最佳实践](llava最佳实践.md)
-4. [Deepseek-VL最佳实践](../Multi-Modal/deepseek-vl最佳实践.md)
-5. [Yi-VL最佳实践.md](../Multi-Modal/yi-vl最佳实践.md)
-6. [Internlm2-Xcomposers最佳实践](../Multi-Modal/internlm-xcomposer2最佳实践.md)
-7. [MiniCPM-V最佳实践](../Multi-Modal/minicpm-v最佳实践.md), [MiniCPM-V-2最佳实践](../Multi-Modal/minicpm-v-2最佳实践.md)
-8. [CogVLM最佳实践](../Multi-Modal/cogvlm最佳实践.md)
-9. [mPLUG-Owl2最佳实践](../Multi-Modal/mplug-owl2最佳实践.md)
+4. [Deepseek-VL最佳实践](deepseek-vl最佳实践.md)
+5. [Yi-VL最佳实践.md](yi-vl最佳实践.md)
+6. [Internlm2-Xcomposers最佳实践](internlm-xcomposer2最佳实践.md)
+7. [MiniCPM-V最佳实践](minicpm-v最佳实践.md), [MiniCPM-V-2最佳实践](minicpm-v-2最佳实践.md)
+8. [CogVLM最佳实践](cogvlm最佳实践.md)
+9. [mPLUG-Owl2最佳实践](mplug-owl2最佳实践.md)
diff --git a/swift/llm/dpo.py b/swift/llm/dpo.py
index a43a46a724..58e6de18cc 100644
--- a/swift/llm/dpo.py
+++ b/swift/llm/dpo.py
@@ -12,10 +12,9 @@
 from swift.utils import (check_json_format, get_dist_setting, get_logger,
                          get_main, get_model_info, is_ddp_plus_mp, is_dist,
                          is_master, plot_images, seed_everything, show_layers)
-from . import get_time_info
 from .tuner import prepare_model
 from .utils import (DPOArguments, Template, get_dataset, get_model_tokenizer,
-                    get_template, set_generation_config)
+                    get_template, get_time_info, set_generation_config)
 
 logger = get_logger()
 
diff --git a/swift/llm/export.py b/swift/llm/export.py
index 09a701cdd3..0fe892b8e7 100644
--- a/swift/llm/export.py
+++ b/swift/llm/export.py
@@ -145,7 +145,7 @@ def gptq_model_quantize(model, tokenizer):
     global _args
     logger.info(f'Quantization dataset: {_args.dataset}')
     gptq_quantizer = GPTQQuantizer(
-        bits=_args.quant_bits, dataset=_args.dataset)
+        bits=_args.quant_bits, dataset=','.join(_args.dataset))
     _origin_get_dataset = quantizer.get_dataset
     quantizer.get_dataset = _get_dataset
     logger.info('Start quantizing the model...')
diff --git a/swift/llm/infer.py b/swift/llm/infer.py
index 88d178cc3d..b6c86e91f7 100644
--- a/swift/llm/infer.py
+++ b/swift/llm/infer.py
@@ -107,10 +107,13 @@ def merge_lora(args: InferArguments,
     model_kwargs = {}
     if is_torch_npu_available():
         logger.info(f'device_count: {torch.npu.device_count()}')
-        device_map = 'npu:0'
+        if device_map is None:
+            device_map = 'npu:0'
     else:
         logger.info(f'device_count: {torch.cuda.device_count()}')
-        device_map = 'auto'
+        if device_map is None:
+            device_map = 'auto'
+    if device_map == 'auto':
         model_kwargs['low_cpu_mem_usage'] = True
     model_kwargs['device_map'] = device_map
 
@@ -150,10 +153,13 @@ def prepare_model_template(
     model_kwargs = {}
     if is_torch_npu_available():
         logger.info(f'device_count: {torch.npu.device_count()}')
-        device_map = 'npu:0'
+        if device_map is None:
+            device_map = 'npu:0'
     else:
         logger.info(f'device_count: {torch.cuda.device_count()}')
-        device_map = 'auto'
+        if device_map is None:
+            device_map = 'auto'
+    if device_map == 'auto':
         model_kwargs['low_cpu_mem_usage'] = True
     model_kwargs['device_map'] = device_map
 
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 0b5c2cb9a4..bceba5a4d9 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -1214,6 +1214,13 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation,
     support_flash_attn=True,
     support_vllm=True)
+@register_model(
+    ModelType.yi_9b_200k,
+    '01ai/Yi-9B-200K',
+    LoRATM.llama2,
+    TemplateType.default_generation,
+    support_flash_attn=True,
+    support_vllm=True)
 @register_model(
     ModelType.yi_6b,
     '01ai/Yi-6B',
@@ -1452,6 +1459,15 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_vllm=True,
     function_kwargs={'is_awq': True},
     requires=['transformers>=4.37', 'autoawq'])
+@register_model(
+    ModelType.qwen1half_32b_chat_awq,
+    'qwen/Qwen1.5-32B-Chat-AWQ',
+    LoRATM.qwen1half,
+    TemplateType.qwen,
+    support_flash_attn=True,
+    support_vllm=True,
+    function_kwargs={'is_awq': True},
+    requires=['transformers>=4.37', 'autoawq'])
 @register_model(
     ModelType.qwen1half_72b_chat_awq,
     'qwen/Qwen1.5-72B-Chat-AWQ',
@@ -2299,6 +2315,15 @@ def get_model_tokenizer_qwen_vl(model_dir: str,
         model_kwargs['quantization_config'].llm_int8_skip_modules = [
             'lm_head', 'attn_pool.attn'
         ]
+        _TransformerBlock = get_class_from_dynamic_module(
+            'visual.TransformerBlock', model_dir)
+
+        def _get_cast_dtype(self) -> torch.dtype:
+            return self.resblocks[0].ln_1.weight.dtype
+
+        _TransformerBlock.__old_get_cast_dtype = _TransformerBlock.get_cast_dtype
+        _TransformerBlock.get_cast_dtype = _get_cast_dtype
+
     get_qwen_function = kwargs.pop('get_qwen_function',
                                    get_model_tokenizer_qwen_chat)
     tokenizer_config = get_tokenizer_config(model_dir)
@@ -2911,6 +2936,16 @@ def _new_generate(inputs=None, *args, **kwargs):
     model.generate = _new_generate
 
 
+@register_model(
+    ModelType.llava1d6_yi_34b_instruct,
+    'AI-ModelScope/llava-v1.6-34b',
+    LoRATM.llama2,
+    TemplateType.llava_yi_instruct,
+    eos_token='<|im_end|>',
+    support_flash_attn=True,
+    function_kwargs={'llm_model_type': 'llama'},
+    tags=['multi-modal', 'vision'],
+    hf_model_id='liuhaotian/llava-v1.6-34b')
 @register_model(
     ModelType.llava1d6_mistral_7b_instruct,
     'AI-ModelScope/llava-v1.6-mistral-7b',
@@ -2918,7 +2953,9 @@ def _new_generate(inputs=None, *args, **kwargs):
     TemplateType.llava_mistral_instruct,
     requires=['transformers>=4.34'],
     support_flash_attn=True,
-    tags=['multi-modal', 'vision'])
+    function_kwargs={'llm_model_type': 'mistral'},
+    tags=['multi-modal', 'vision'],
+    hf_model_id='liuhaotian/llava-v1.6-mistral-7b')
 def get_model_tokenizer_llava(model_dir: str,
                               torch_dtype: Dtype,
                               model_kwargs: Dict[str, Any],
@@ -2928,76 +2965,38 @@ def get_model_tokenizer_llava(model_dir: str,
         'https://github.com/haotian-liu/LLaVA.git')
     sys.path.append(os.path.join(local_repo_path))
 
-    from llava.model import LlavaMistralForCausalLM, LlavaMistralConfig
-    model_config = LlavaMistralConfig.from_pretrained(model_dir)
+    llm_model_type = kwargs.pop('llm_model_type')
+    if llm_model_type == 'mistral':
+        from llava.model import LlavaMistralForCausalLM, LlavaMistralConfig
+        model_config = LlavaMistralConfig.from_pretrained(model_dir)
+        automodel_class = LlavaMistralForCausalLM
+    else:  # llama
+        from llava.model import LlavaLlamaForCausalLM, LlavaConfig
+        if not hasattr(LlavaLlamaForCausalLM,
+                       '__old_forward'):  # Avoid double patching
+            forward = LlavaLlamaForCausalLM.forward
+            LlavaLlamaForCausalLM.__old_forward = forward
+
+            @wraps(forward)
+            def _new_forward(*args, **kwargs):
+                kwargs.pop('cache_position', None)
+                return forward(*args, **kwargs)
+
+            LlavaLlamaForCausalLM.forward = _new_forward
+        model_config = LlavaConfig.from_pretrained(model_dir)
+        automodel_class = LlavaLlamaForCausalLM
     model_config.mm_vision_tower = snapshot_download(
         'AI-ModelScope/clip-vit-large-patch14-336')
     model, tokenizer = get_model_tokenizer_with_flash_attn(
-        model_dir,
-        torch_dtype,
+	@@ -2938,7 +3242,7 @@ def get_model_tokenizer_llava(model_dir: str,
         model_kwargs,
         load_model,
         model_config=model_config,
-        automodel_class=LlavaMistralForCausalLM,
+        automodel_class=automodel_class,
         **kwargs)
 
     model.resize_token_embeddings(len(tokenizer))
-    vision_tower = model.get_vision_tower()
-    device_map = str(model_kwargs.get('device_map', str(model.device)))
-    if not vision_tower.is_loaded:
-        vision_tower.load_model(device_map=device_map)
-    if not hasattr(model.config, 'max_sequence_length'):
-        model.config.max_sequence_length = 2048
-    _patch_llava(model)
-    return model, tokenizer
-
-
-@register_model(
-    ModelType.llava1d6_yi_34b_instruct,
-    'AI-ModelScope/llava-v1.6-34b',
-    LoRATM.llama2,
-    TemplateType.llava_yi_instruct,
-    eos_token='<|im_end|>',
-    support_flash_attn=True,
-    tags=['multi-modal', 'vision'])
-def get_model_tokenizer_llava_34b(model_dir: str,
-                                  torch_dtype: Dtype,
-                                  model_kwargs: Dict[str, Any],
-                                  load_model: bool = True,
-                                  **kwargs):
-    local_repo_path = _git_clone_github(
-        'https://github.com/haotian-liu/LLaVA.git')
-    sys.path.append(os.path.join(local_repo_path))
-
-    from llava.model import LlavaLlamaForCausalLM, LlavaConfig
-    forward = LlavaLlamaForCausalLM.forward
-    LlavaLlamaForCausalLM.__old_forward = forward
-
-    @wraps(forward)
-    def _new_forward(*args, **kwargs):
-        kwargs.pop('cache_position', None)
-        return forward(*args, **kwargs)
-
-    LlavaLlamaForCausalLM.forward = _new_forward
-    model_config = LlavaConfig.from_pretrained(model_dir)
-    model_config.mm_vision_tower = snapshot_download(
-        'AI-ModelScope/clip-vit-large-patch14-336')
-    model, tokenizer = get_model_tokenizer_with_flash_attn(
-        model_dir,
-        torch_dtype,
-        model_kwargs,
-        load_model,
-        model_config=model_config,
-        automodel_class=LlavaLlamaForCausalLM,
-        **kwargs)
-    model.resize_token_embeddings(len(tokenizer))
-    vision_tower = model.get_vision_tower()
-    device_map = str(model_kwargs.get('device_map', str(model.device)))
-    if not vision_tower.is_loaded:
-        vision_tower.load_model(device_map=device_map)
-    if not hasattr(model.config, 'max_sequence_length'):
-        model.config.max_sequence_length = 2048
-    _patch_llava(model)
+	@@ -2952,55 +3256,6 @@ def get_model_tokenizer_llava(model_dir: str,
     return model, tokenizer
 
 

From 48f587c199aec37303f84753a9758be7c7536b5f Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 16 Apr 2024 18:42:50 +0800
Subject: [PATCH 06/26] fix bug

---
 swift/llm/utils/model.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index bceba5a4d9..9d696bb558 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -2988,7 +2988,8 @@ def _new_forward(*args, **kwargs):
     model_config.mm_vision_tower = snapshot_download(
         'AI-ModelScope/clip-vit-large-patch14-336')
     model, tokenizer = get_model_tokenizer_with_flash_attn(
-	@@ -2938,7 +3242,7 @@ def get_model_tokenizer_llava(model_dir: str,
+        model_dir,
+        torch_dtype,
         model_kwargs,
         load_model,
         model_config=model_config,
@@ -2996,7 +2997,13 @@ def _new_forward(*args, **kwargs):
         **kwargs)
 
     model.resize_token_embeddings(len(tokenizer))
-	@@ -2952,55 +3256,6 @@ def get_model_tokenizer_llava(model_dir: str,
+    vision_tower = model.get_vision_tower()
+    device_map = str(model_kwargs.get('device_map', str(model.device)))
+    if not vision_tower.is_loaded:
+        vision_tower.load_model(device_map=device_map)
+    if not hasattr(model.config, 'max_sequence_length'):
+        model.config.max_sequence_length = 2048
+    _patch_llava(model)
     return model, tokenizer
 
 

From 4a68e761cab7ca2d282e055f2448fa0398e23ecf Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 16 Apr 2024 18:48:26 +0800
Subject: [PATCH 07/26] fix bug

---
 swift/llm/utils/model.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 9d696bb558..805586b6dc 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -95,6 +95,7 @@ class ModelType:
     qwen1half_4b_chat_awq = 'qwen1half-4b-chat-awq'
     qwen1half_7b_chat_awq = 'qwen1half-7b-chat-awq'
     qwen1half_14b_chat_awq = 'qwen1half-14b-chat-awq'
+    qwen1half_32b_chat_awq = 'qwen1half-32b-chat-awq'
     qwen1half_72b_chat_awq = 'qwen1half-72b-chat-awq'
 
     # qwen-vl
@@ -127,6 +128,7 @@ class ModelType:
     yi_6b_200k = 'yi-6b-200k'
     yi_6b_chat = 'yi-6b-chat'
     yi_9b = 'yi-9b'
+    yi_9b_200k = 'yi-9b-200k'
     yi_34b = 'yi-34b'
     yi_34b_200k = 'yi-34b-200k'
     yi_34b_chat = 'yi-34b-chat'

From f51d954f05aec40aa6828046254e676f98db58f6 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 16 Apr 2024 18:50:42 +0800
Subject: [PATCH 08/26] update

---
 swift/llm/utils/model.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index bd48f18b24..6d4c86842b 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -1343,13 +1343,6 @@ def cross_entropy_forward(self, inputs: Tensor,
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='01-ai/Yi-9B-200K')
-@register_model(
-    ModelType.yi_9b_200k,
-    '01ai/Yi-9B-200K',
-    LoRATM.llama2,
-    TemplateType.default_generation,
-    support_flash_attn=True,
-    support_vllm=True)
 @register_model(
     ModelType.yi_6b,
     '01ai/Yi-6B',
@@ -1624,15 +1617,6 @@ def get_model_tokenizer_aqlm(model_dir: str,
     function_kwargs={'is_awq': True},
     requires=['transformers>=4.37', 'autoawq'],
     hf_model_id='Qwen/Qwen1.5-32B-Chat-AWQ')
-@register_model(
-    ModelType.qwen1half_32b_chat_awq,
-    'qwen/Qwen1.5-32B-Chat-AWQ',
-    LoRATM.qwen1half,
-    TemplateType.qwen,
-    support_flash_attn=True,
-    support_vllm=True,
-    function_kwargs={'is_awq': True},
-    requires=['transformers>=4.37', 'autoawq'])
 @register_model(
     ModelType.qwen1half_72b_chat_awq,
     'qwen/Qwen1.5-72B-Chat-AWQ',

From 4fc4aae270a6cc792320c1ae6edb44fdc766171d Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 16 Apr 2024 19:00:26 +0800
Subject: [PATCH 09/26] fix

---
 swift/llm/utils/model.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index bd48f18b24..6d4c86842b 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -1343,13 +1343,6 @@ def cross_entropy_forward(self, inputs: Tensor,
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='01-ai/Yi-9B-200K')
-@register_model(
-    ModelType.yi_9b_200k,
-    '01ai/Yi-9B-200K',
-    LoRATM.llama2,
-    TemplateType.default_generation,
-    support_flash_attn=True,
-    support_vllm=True)
 @register_model(
     ModelType.yi_6b,
     '01ai/Yi-6B',
@@ -1624,15 +1617,6 @@ def get_model_tokenizer_aqlm(model_dir: str,
     function_kwargs={'is_awq': True},
     requires=['transformers>=4.37', 'autoawq'],
     hf_model_id='Qwen/Qwen1.5-32B-Chat-AWQ')
-@register_model(
-    ModelType.qwen1half_32b_chat_awq,
-    'qwen/Qwen1.5-32B-Chat-AWQ',
-    LoRATM.qwen1half,
-    TemplateType.qwen,
-    support_flash_attn=True,
-    support_vllm=True,
-    function_kwargs={'is_awq': True},
-    requires=['transformers>=4.37', 'autoawq'])
 @register_model(
     ModelType.qwen1half_72b_chat_awq,
     'qwen/Qwen1.5-72B-Chat-AWQ',

From 36adefdc2a29501cb08cdd28725b53a10c7ff72a Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 16 Apr 2024 21:32:36 +0800
Subject: [PATCH 10/26] update revision

---
 swift/llm/dpo.py              |  2 ++
 swift/llm/export.py           |  1 +
 swift/llm/infer.py            |  4 +++-
 swift/llm/sft.py              |  1 +
 swift/llm/utils/argument.py   | 10 ++++++----
 swift/llm/utils/model.py      | 21 ++++++++++++++++-----
 swift/llm/utils/utils.py      | 31 +++++++++++++++++--------------
 swift/llm/utils/vllm_utils.py |  4 +++-
 8 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/swift/llm/dpo.py b/swift/llm/dpo.py
index 58e6de18cc..56718f18b3 100644
--- a/swift/llm/dpo.py
+++ b/swift/llm/dpo.py
@@ -55,6 +55,7 @@ def llm_dpo(args: DPOArguments) -> str:
         args.torch_dtype,
         model_kwargs,
         model_id_or_path=args.model_id_or_path,
+        revision=args.model_revision,
         **kwargs)
     if args.ref_model_type is not None:
         ref_model, _ = get_model_tokenizer(
@@ -62,6 +63,7 @@ def llm_dpo(args: DPOArguments) -> str:
             args.torch_dtype,
             model_kwargs,
             model_id_or_path=args.ref_model_id_or_path,
+            revision=args.model_revision,
             **kwargs)
     else:
         ref_model = None
diff --git a/swift/llm/export.py b/swift/llm/export.py
index 0fe892b8e7..3ecc282f08 100644
--- a/swift/llm/export.py
+++ b/swift/llm/export.py
@@ -38,6 +38,7 @@ def prepare_awq_model_template(
         args.torch_dtype,
         model_kwargs,
         model_id_or_path=model_id_or_path,
+        revision=args.model_revision,
         automodel_class=AutoAWQForCausalLM)
     logger.info(f'model_config: {model.config}')
     generation_config = GenerationConfig(
diff --git a/swift/llm/infer.py b/swift/llm/infer.py
index b6c86e91f7..e34177a2ad 100644
--- a/swift/llm/infer.py
+++ b/swift/llm/infer.py
@@ -125,7 +125,8 @@ def merge_lora(args: InferArguments,
         args.model_type,
         args.torch_dtype,
         model_kwargs,
-        model_id_or_path=model_id_or_path)
+        model_id_or_path=model_id_or_path,
+        revision=args.model_revision)
     logger.info(f'model_config: {model.config}')
 
     # Preparing LoRA
@@ -189,6 +190,7 @@ def prepare_model_template(
         args.torch_dtype,
         model_kwargs,
         model_id_or_path=model_id_or_path,
+        revision=args.model_revision,
         **kwargs)
     logger.info(f'model_config: {model.config}')
     if model.max_model_len is None:
diff --git a/swift/llm/sft.py b/swift/llm/sft.py
index e69bd7cfd8..67d8ecfaac 100644
--- a/swift/llm/sft.py
+++ b/swift/llm/sft.py
@@ -78,6 +78,7 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
         args.torch_dtype,
         model_kwargs,
         model_id_or_path=args.model_id_or_path,
+        revision=args.model_revision,
         is_training=True,
         **kwargs)
     logger.info(f'model_config: {model.config}')
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index 69dbd3d327..3447abce36 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -14,7 +14,7 @@
 from torch import dtype as Dtype
 from transformers.utils import (is_torch_bf16_gpu_available,
                                 is_torch_cuda_available,
-                                is_torch_npu_available)
+                                is_torch_npu_available, strtobool)
 from transformers.utils.versions import require_version
 
 from swift.hub import HubApi, ModelScopeConfig
@@ -1044,11 +1044,13 @@ def set_model_type(args: Union[SftArguments, InferArguments]) -> None:
         raise ValueError(f"model_type: '{args.model_type}' is not registered. "
                          + error_msg)
     model_info = MODEL_MAPPING[args.model_type]
-    if args.model_revision is None:
-        args.model_revision = model_info['revision']
-    else:
+    use_hf = strtobool(os.environ.get('USE_HF', 'False'))
+    if args.model_revision is not None:
         model_info['revision'] = args.model_revision
         logger.info(f"Setting model_info['revision']: {args.model_revision}")
+    elif use_hf:
+        model_info['revision'] = 'main'
+    args.model_revision = model_info['revision']
     if args.model_id_or_path is None:
         args.model_id_or_path = model_info['model_id_or_path']
     requires = model_info['requires']
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 6d4c86842b..07e56f3437 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -359,7 +359,7 @@ def register_model(
     requires: Optional[List[str]] = None,
     torch_dtype: Optional[Dtype] = None,
     hf_model_id: Optional[str] = None,
-    revision: Optional[str] = None,
+    revision: Optional[str] = None,  # only modelscope
     ignore_file_pattern: Optional[List[str]] = None,
     function_kwargs: Optional[Dict[str, Any]] = None,
     exists_ok: bool = False,
@@ -3352,26 +3352,34 @@ def fix_gradient_checkpointing_warning() -> None:
 
 def safe_snapshot_download(model_type: str,
                            model_id_or_path: Optional[str] = None,
+                           revision: Optional[str] = None,
                            **kwargs) -> str:
     # Perform snapshot_download (ms or hf) based on model_type and model_id_or_path.
     model_info = MODEL_MAPPING[model_type]
+    use_hf = strtobool(os.environ.get('USE_HF', 'False'))
     if model_id_or_path is None:
         model_dir = kwargs.pop('model_dir', None)  # compat with swift<1.7
         if model_dir is not None:
             model_id_or_path = model_dir
         else:
-            model_id_or_path = model_info['model_id_or_path']
+            model_id_or_path = model_info[
+                'hf_model_id' if use_hf else 'model_id_or_path']
 
     if is_dist() and not is_local_master():
         dist.barrier()
     if model_id_or_path is not None and not os.path.exists(model_id_or_path):
-        revision = model_info['revision']
-        use_hf = strtobool(os.environ.get('USE_HF', 'False'))
         ignore_file_pattern = model_info['ignore_file_pattern']
         if use_hf:
+            if revision is None:
+                revision = 'main'
             logger.info(
                 f'Downloading the model from HuggingFace Hub, model_id: {model_id_or_path}'
             )
+            use_hf_transfer = strtobool(
+                os.environ.get('USE_HF_TRANSFER', 'False'))
+            if use_hf_transfer:
+                import huggingface_hub._snapshot_download as hf_s
+                hf_s.HF_HUB_ENABLE_HF_TRANSFER = True
             from huggingface_hub import snapshot_download as hf_snapshot_download
             model_dir = hf_snapshot_download(
                 model_id_or_path,
@@ -3379,6 +3387,8 @@ def safe_snapshot_download(model_type: str,
                 revision=revision,
                 ignore_patterns=ignore_file_pattern)
         else:
+            if revision is None:
+                revision = model_info['revision']
             logger.info(
                 f'Downloading the model from ModelScope Hub, model_id: {model_id_or_path}'
             )
@@ -3414,6 +3424,7 @@ def get_model_tokenizer(
         load_model: bool = True,
         *,
         model_id_or_path: Optional[str] = None,
+        revision: Optional[str] = None,
         **kwargs) -> Tuple[Optional[PreTrainedModel], PreTrainedTokenizerBase]:
     """
     torch_dtype: If you use None, it will retrieve the torch_dtype from the config.json file.
@@ -3421,7 +3432,7 @@ def get_model_tokenizer(
     """
     model_dir = kwargs.pop('model_dir', None)  # compat with swift<1.7
     model_dir = safe_snapshot_download(
-        model_type, model_id_or_path, model_dir=model_dir)
+        model_type, model_id_or_path, revision=revision, model_dir=model_dir)
 
     model_info = MODEL_MAPPING[model_type]
     requires = model_info['requires']
diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py
index b10eeebacf..f036b1760d 100644
--- a/swift/llm/utils/utils.py
+++ b/swift/llm/utils/utils.py
@@ -25,7 +25,6 @@
 from accelerate.utils.modeling import (get_balanced_memory,
                                        infer_auto_device_map)
 from datasets import Dataset as HfDataset
-from modelscope import MsDataset
 from modelscope.utils.config_ds import MS_CACHE_HOME
 from modelscope.utils.logger import get_logger as get_ms_logger
 from torch import device as Device
@@ -37,6 +36,7 @@
                           PreTrainedTokenizerBase, StoppingCriteriaList,
                           TextStreamer, trainer)
 from transformers.generation.streamers import BaseStreamer
+from transformers.utils import strtobool
 
 from swift.hub import ModelScopeConfig
 from swift.tuners.module_mapping import MODEL_KEYS_MAPPING
@@ -92,20 +92,25 @@ def download_dataset(model_id: str,
     return local_dir
 
 
-_old_msdataset_load = MsDataset.load
+use_hf = strtobool(os.environ.get('USE_HF', 'False'))
+if not use_hf:
+    from modelscope import MsDataset
+    _old_msdataset_load = MsDataset.load
 
+    @wraps(_old_msdataset_load)
+    def _msdataset_ddp_load(*args, **kwargs):
+        if is_dist() and not is_local_master():
+            dist.barrier()
+        dataset = _old_msdataset_load(*args, **kwargs)
+        if is_dist() and is_local_master():
+            dist.barrier()
 
-@wraps(_old_msdataset_load)
-def _msdataset_ddp_load(*args, **kwargs):
-    if is_dist() and not is_local_master():
-        dist.barrier()
-    dataset = _old_msdataset_load(*args, **kwargs)
-    if is_dist() and is_local_master():
-        dist.barrier()
+        if is_dist():
+            dist.barrier()
+        return dataset
 
-    if is_dist():
-        dist.barrier()
-    return dataset
+    # monkey patching
+    MsDataset.load = _msdataset_ddp_load
 
 
 def _get_max_memory(device_ids: List[int]) -> Dict[Union[int, str], int]:
@@ -899,8 +904,6 @@ def get_max_model_len(config: PretrainedConfig) -> Optional[int]:
     return max_model_len
 
 
-# monkey patching
-MsDataset.load = _msdataset_ddp_load
 if is_ddp_plus_mp():
     _old_ddp_init = DDP.__init__
     accelerate.accelerator.torch.nn.parallel.DistributedDataParallel.__init__ = (
diff --git a/swift/llm/utils/vllm_utils.py b/swift/llm/utils/vllm_utils.py
index 16aed7d3df..4e8b90e7b1 100644
--- a/swift/llm/utils/vllm_utils.py
+++ b/swift/llm/utils/vllm_utils.py
@@ -33,6 +33,7 @@ def get_vllm_engine(model_type: str,
                     torch_dtype: Optional[Dtype] = None,
                     *,
                     model_id_or_path: Optional[str] = None,
+                    revision: Optional[str] = None,
                     gpu_memory_utilization: float = 0.9,
                     tensor_parallel_size: int = 1,
                     max_model_len: Optional[int] = None,
@@ -47,7 +48,8 @@ def get_vllm_engine(model_type: str,
         model_type,
         load_model=False,
         model_id_or_path=model_id_or_path,
-        model_dir=model_dir)[1]
+        model_dir=model_dir,
+        revision=revision)[1]
     model_dir = tokenizer.model_dir
 
     if engine_kwargs is None:

From 2c812268749676afc67f44501ae83b960596ffa1 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Wed, 17 Apr 2024 16:22:09 +0800
Subject: [PATCH 11/26] update dataset

---
 ...56\350\260\203\346\226\207\346\241\243.md" |   4 +
 swift/llm/export.py                           |   2 +-
 swift/llm/utils/dataset.py                    | 130 +++++++++++++-----
 3 files changed, 99 insertions(+), 37 deletions(-)

diff --git "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
index 8185ed0de4..2d38e22f60 100644
--- "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
+++ "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md"
@@ -7,6 +7,7 @@
 - [量化](#量化)
 - [推理](#推理)
 - [Web-UI](#web-ui)
+- [推送模型](#推送模型)
 
 ## 环境准备
 GPU设备: A10, 3090, V100, A100均可.
@@ -287,3 +288,6 @@ CUDA_VISIBLE_DEVICES=0 swift export \
 
 CUDA_VISIBLE_DEVICES=0 swift app-ui --ckpt_dir 'xxx/vx-xxx/checkpoint-xxx-merged'
 ```
+
+## 推送模型
+如果你想推送模型到ModelScope，可以参考[模型推送文档](LLM量化文档.md#推送模型)
diff --git a/swift/llm/export.py b/swift/llm/export.py
index 3ecc282f08..f4fb58e8e0 100644
--- a/swift/llm/export.py
+++ b/swift/llm/export.py
@@ -181,7 +181,7 @@ def llm_export(args: ExportArguments) -> None:
                 ckpt_dir,
                 f'{ckpt_name}-{args.quant_method}-int{args.quant_bits}')
         logger.info(f'Setting quant_path: {quant_path}')
-        assert not os.path.exists(quant_path)
+        assert not os.path.exists(quant_path), f'quant_path: {quant_path}'
         if args.quant_method == 'awq':
             awq_model, template = prepare_awq_model_template(args)
             awq_model_quantize(awq_model, template.tokenizer)
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index 7ac363aaa0..fcfde43c68 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -9,11 +9,11 @@
 import numpy as np
 import pandas as pd
 from datasets import Dataset as HfDataset
-from datasets import concatenate_datasets
-from modelscope import MsDataset
+from datasets import concatenate_datasets, load_dataset
 from numpy.random import RandomState
 from pandas import DataFrame
 from tqdm.auto import tqdm
+from transformers.utils import strtobool
 
 from swift.utils import (get_logger, get_seed, read_from_jsonl,
                          transform_jsonl_to_df)
@@ -179,6 +179,7 @@ def register_dataset(
         preprocess_func: Optional[PreprocessFunc] = None,
         get_function: Optional[GetDatasetFunction] = None,
         *,
+        hf_dataset_id: Optional[str] = None,
         function_kwargs: Optional[Dict[str, Any]] = None,
         exists_ok: bool = False,
         **kwargs
@@ -201,6 +202,7 @@ def register_dataset(
         'train_subset_split_list': train_subset_split_list,
         'val_subset_split_list': val_subset_split_list,
         'preprocess_func': preprocess_func,
+        'hf_dataset_id': hf_dataset_id,
         **kwargs
     }
     if get_function is not None:
@@ -225,6 +227,7 @@ def _register_dataset(
 def load_ms_dataset(
         dataset_id: str,
         subset_split_list: Optional[List[SubsetSplit]]) -> Optional[HfDataset]:
+    from modelscope import MsDataset
     if subset_split_list is None or len(subset_split_list) == 0:
         return None
     dataset_list = []
@@ -241,14 +244,34 @@ def load_ms_dataset(
     return concatenate_datasets(dataset_list)
 
 
+def load_hf_dataset(
+        dataset_id: str,
+        subset_split_list: Optional[List[SubsetSplit]]) -> Optional[HfDataset]:
+    if subset_split_list is None or len(subset_split_list) == 0:
+        return None
+    dataset_list = []
+    for subset_split in subset_split_list:
+        if isinstance(subset_split, str):
+            subset_split = (None, subset_split)
+        assert len(subset_split) == 2
+        subset_name, split = subset_split
+        dataset = load_dataset(
+            dataset_id, name=subset_name, split=split)
+        dataset_list.append(dataset)
+    return concatenate_datasets(dataset_list)
+
+
 @register_dataset(
     DatasetName.text2sql_en,
     'AI-ModelScope/texttosqlv2_25000_v2', ['train'],
-    tags=['chat', 'sql'])
+    tags=['chat', 'sql'],
+    hf_dataset_id='Clinton/texttosqlv2_25000_v2')
 @register_dataset(
     DatasetName.school_math_zh,
-    'AI-ModelScope/school_math_0.25M', ['train'],
-    tags=['chat', 'math'])
+    'AI-ModelScope/school_math_0.25M',
+    ['train'],
+    tags=['chat', 'math'],
+    hf_dataset_id='BelleGroup/school_math_0.25M')
 @register_dataset(
     DatasetName.gpt4all_en,
     'wyj123456/GPT4all', ['train'],
@@ -268,15 +291,18 @@ def load_ms_dataset(
 @register_dataset(
     DatasetName.code_alpaca_en,
     'wyj123456/code_alpaca_en', ['train'],
-    tags=['chat', 'coding'])
+    tags=['chat', 'coding'],
+    hf_dataset_id='sahil2801/CodeAlpaca-20k')
 @register_dataset(
     DatasetName.finance_en,
     'wyj123456/finance_en', ['train'],
-    tags=['chat', 'financial'])
+    tags=['chat', 'financial'],
+    hf_dataset_id='ssbuild/alpaca_finance_en')
 @register_dataset(
     DatasetName.alpaca_en,
     'AI-ModelScope/alpaca-gpt4-data-en', ['train'],
-    tags=['chat', 'general', '🔥'])
+    tags=['chat', 'general', '🔥'],
+    hf_dataset_id='vicgalle/alpaca-gpt4')
 @register_dataset(
     DatasetName.coig_cqia_chinese_traditional,
     'AI-ModelScope/COIG-CQIA', [('chinese_traditional', 'train')],
@@ -344,12 +370,16 @@ def get_dataset_from_repo(
         preprocess_func: PreprocessFunc,
         remove_useless_columns: bool = True,
         train_dataset_sample: int = -1,
-        val_dataset_sample: int = -1) -> Tuple[HfDataset, Optional[HfDataset]]:
+        val_dataset_sample: int = -1,
+        use_hf: bool = False) -> Tuple[HfDataset, Optional[HfDataset]]:
     dataset_list = []
     _iter = zip([train_subset_split_list, val_subset_split_list],
                 [train_dataset_sample, val_dataset_sample])
     for subset_split_list, dataset_sample in _iter:
-        dataset = load_ms_dataset(dataset_id, subset_split_list)
+        if use_hf:
+            dataset = load_hf_dataset(dataset_id, subset_split_list)
+        else:
+            dataset = load_ms_dataset(dataset_id, subset_split_list)
         if dataset is not None:
             if dataset_sample > 0 and len(dataset) > dataset_sample:
                 random_state = np.random.RandomState(42)
@@ -402,7 +432,8 @@ def _concat_inst_inp_alpaca_zh(inst: str, inp: str) -> str:
     None,
     AlpacaPreprocessor(concat_inst_inp=_concat_inst_inp_alpaca_zh),
     get_dataset_from_repo,
-    tags=['chat', 'general', '🔥'])
+    tags=['chat', 'general', '🔥'],
+    hf_dataset_id='c-s-ale/alpaca-gpt4-data-zh')
 
 
 def _preprocess_vision_dataset(dataset: HfDataset) -> HfDataset:
@@ -559,7 +590,8 @@ def map_row(row):
     'AI-ModelScope/LongAlpaca-12k', ['train'], [],
     long_alpaca_preprocessor,
     get_dataset_from_repo,
-    tags=['longlora', 'QA'])
+    tags=['longlora', 'QA'],
+    hf_dataset_id='Yukang/LongAlpaca-12k')
 
 
 def _preprocess_ruozhiba(dataset: HfDataset):
@@ -647,7 +679,8 @@ def map_row(row):
     'lvjianjin/AdvertiseGen', ['train'], ['validation'],
     TextGenerationPreprocessor(advertise_gen_prompt, 'content', 'summary'),
     get_dataset_from_repo,
-    tags=['text-generation', '🔥'])
+    tags=['text-generation', '🔥'],
+    hf_dataset_id='shibing624/AdvertiseGen')
 
 _firefly_kind_list = [
     'ProseGeneration', 'MRC', 'JinYongGeneration', 'TextCorrection',
@@ -718,7 +751,8 @@ def get_firefly_zh_dataset(dataset_id: str, preprocess_func,
     ClsPreprocessor(['neutral', 'entailment', 'contradiction'],
                     'Natural Language Inference', True),
     get_dataset_from_repo,
-    tags=['text-generation', 'classification'])
+    tags=['text-generation', 'classification'],
+    hf_dataset_id='clue')
 
 register_dataset(
     DatasetName.cmnli_mini_zh,
@@ -730,7 +764,8 @@ def get_firefly_zh_dataset(dataset_id: str, preprocess_func,
         'train_dataset_sample': 20000,
         'val_dataset_sample': 200
     },
-    tags=['text-generation', 'classification', '🔥'])
+    tags=['text-generation', 'classification', '🔥'],
+    hf_dataset_id='clue')
 
 register_dataset(
     DatasetName.jd_sentiment_zh,
@@ -1135,7 +1170,8 @@ def _preprocess_blossom_math(dataset: HfDataset) -> HfDataset:
     None,
     _preprocess_blossom_math,
     get_dataset_from_repo,
-    tags=['chat', 'math', '🔥'])
+    tags=['chat', 'math', '🔥'],
+    hf_dataset_id='Azure99/blossom-math-v2')
 
 register_dataset(
     DatasetName.sql_create_context_en,
@@ -1150,7 +1186,8 @@ def _preprocess_blossom_math(dataset: HfDataset) -> HfDataset:
         AlpacaPreprocessor(),
     ]),
     get_dataset_from_repo,
-    tags=['chat', 'sql', '🔥'])
+    tags=['chat', 'sql', '🔥'],
+    hf_dataset_id='b-mc2/sql-create-context')
 
 register_dataset(
     DatasetName.lawyer_llama_zh,
@@ -1162,7 +1199,8 @@ def _preprocess_blossom_math(dataset: HfDataset) -> HfDataset:
         'history': '_'
     }),
     get_dataset_from_repo,
-    tags=['chat', 'law'])
+    tags=['chat', 'law'],
+    hf_dataset_id='Skepsun/lawyer_llama_data')
 
 
 def _preprocess_tigerbot_law(dataset: HfDataset) -> HfDataset:
@@ -1189,7 +1227,8 @@ def _preprocess_tigerbot_law(dataset: HfDataset) -> HfDataset:
     None,
     _preprocess_tigerbot_law,
     get_dataset_from_repo,
-    tags=['text-generation', 'law', 'pretrained'])
+    tags=['text-generation', 'law', 'pretrained'],
+    hf_dataset_id='TigerResearch/tigerbot-law-plugin')
 
 
 def _preprocess_leetcode_python(dataset: HfDataset) -> HfDataset:
@@ -1282,7 +1321,8 @@ def _preprocess_hc3(dataset: HfDataset) -> HfDataset:
     [[subset, 'train'] for subset in hc3_chinese_subset], [],
     _preprocess_hc3,
     get_dataset_from_repo,
-    tags=['text-generation', 'classification', '🔥'])
+    tags=['text-generation', 'classification', '🔥'],
+    hf_dataset_id='Hello-SimpleAI/HC3-Chinese')
 
 register_dataset(
     DatasetName.hc3_en,
@@ -1290,40 +1330,46 @@ def _preprocess_hc3(dataset: HfDataset) -> HfDataset:
     [],
     _preprocess_hc3,
     get_dataset_from_repo,
-    tags=['text-generation', 'classification', '🔥'])
+    tags=['text-generation', 'classification', '🔥'],
+    hf_dataset_id='Hello-SimpleAI/HC3')
 
 register_dataset(
     DatasetName.tulu_v2_sft_mixture,
     'AI-ModelScope/tulu-v2-sft-mixture', ['train'], [],
     None,
     get_dataset_from_repo,
-    tags=['chat', 'multilingual', 'general', 'multi-round'])
+    tags=['chat', 'multilingual', 'general', 'multi-round'],
+    hf_dataset_id='allenai/tulu-v2-sft-mixture')
 register_dataset(
     DatasetName.webnovel_zh,
     'AI-ModelScope/webnovel_cn', ['train'], [],
     None,
     get_dataset_from_repo,
-    tags=['chat', 'novel'])
+    tags=['chat', 'novel'],
+    hf_dataset_id='zxbsmk/webnovel_cn')
 register_dataset(
     DatasetName.generated_chat_zh,
     'AI-ModelScope/generated_chat_0.4M', ['train'], [],
     None,
     get_dataset_from_repo,
-    tags=['chat', 'character-dialogue'])
+    tags=['chat', 'character-dialogue'],
+    hf_dataset_id='BelleGroup/generated_chat_0.4M')
 register_dataset(
     DatasetName.wikipedia_zh,
     'AI-ModelScope/wikipedia-cn-20230720-filtered', ['train'],
     None,
     RenameColumnsPreprocessor({'completion': 'response'}),
     get_dataset_from_repo,
-    tags=['text-generation', 'general', 'pretrained'])
+    tags=['text-generation', 'general', 'pretrained'],
+    hf_dataset_id='pleisto/wikipedia-cn-20230720-filtered')
 register_dataset(
     DatasetName.open_platypus_en,
     'AI-ModelScope/Open-Platypus', ['train'],
     None,
     None,
     get_dataset_from_repo,
-    tags=['chat', 'math'])
+    tags=['chat', 'math'],
+    hf_dataset_id='garage-bAInd/Open-Platypus')
 register_dataset(
     DatasetName.open_orca_gpt4,
     'AI-ModelScope/OpenOrca', ['train'],
@@ -1366,8 +1412,10 @@ def _preprocess_hc3(dataset: HfDataset) -> HfDataset:
         value_key='content',
         error_strategy='delete'),
     get_dataset_from_repo,
-    tags=['chat', 'medical', '🔥'])
+    tags=['chat', 'medical', '🔥'],
+    hf_dataset_id='Flmc/DISC-Med-SFT')
 
+# hf_dataset_id='ShengbinYue/DISC-Law-SFT'
 register_dataset(
     DatasetName.disc_law_sft_zh,
     'AI-ModelScope/DISC-Law-SFT', ['train'],
@@ -1381,13 +1429,14 @@ def _preprocess_hc3(dataset: HfDataset) -> HfDataset:
 
 register_dataset(
     DatasetName.pileval,
-    'huangjintao/pile-val-backup', ['train'],
+    'huangjintao/pile-val-backup', ['validation'],
     None,
     RenameColumnsPreprocessor({
         'text': 'response',
     }),
     get_dataset_from_repo,
-    tags=['text-generation', 'awq'])
+    tags=['text-generation', 'awq'],
+    hf_dataset_id='mit-han-lab/pile-val-backup')
 
 
 def add_self_cognition_dataset(
@@ -1494,7 +1543,7 @@ def get_dataset(
     dataset_test_ratio: float = 0.,
     dataset_seed: Union[RandomState, int] = 42,
     check_dataset_strategy: Literal['none', 'discard', 'error',
-                                    'warning'] = 'none'
+                                    'warning'] = 'none',
 ) -> Tuple[HfDataset, Optional[HfDataset]]:
     """Returns train_dataset and val_dataset"""
     if isinstance(dataset_name_list, str):
@@ -1506,12 +1555,21 @@ def get_dataset(
         random_state = RandomState(dataset_seed)
     for dataset_name in dataset_name_list:
         dataset_info = DATASET_MAPPING[dataset_name]
+        use_hf = strtobool(os.environ.get('USE_HF', 'False'))
+        if use_hf:
+            dataset_id_or_path = dataset_info['hf_dataset_id']
+        else:
+            dataset_id_or_path = dataset_info['dataset_id_or_path']
+        assert dataset_id_or_path is not None, (
+            f'dataset_name: {dataset_name}, use_hf: {use_hf}, '
+            f'dataset_id_or_path: {dataset_id_or_path}.')
         get_function: GetDatasetFunction = dataset_info['get_function']
         dataset = get_function(
-            dataset_info['dataset_id_or_path'],
+            dataset_id_or_path,
             train_subset_split_list=dataset_info['train_subset_split_list'],
             val_subset_split_list=dataset_info['val_subset_split_list'],
-            preprocess_func=dataset_info['preprocess_func'])
+            preprocess_func=dataset_info['preprocess_func'],
+            use_hf=use_hf)
         train_d: HfDataset
         if isinstance(dataset, (list, tuple)):
             train_d, val_d = dataset
@@ -1572,12 +1630,12 @@ def load_dataset_from_local(
     return concatenate_datasets(dataset_list)
 
 
-def get_custom_dataset(_: str, train_subset_split_list: Union[str, List[str]],
-                       val_subset_split_list: Optional[Union[str, List[str]]],
+def get_custom_dataset(_: str, train_dataset_path_list: Union[str, List[str]],
+                       val_dataset_path_list: Optional[Union[str, List[str]]],
                        preprocess_func: PreprocessFunc,
                        **kwargs) -> Tuple[HfDataset, Optional[HfDataset]]:
-    train_dataset = load_dataset_from_local(train_subset_split_list,
+    train_dataset = load_dataset_from_local(train_dataset_path_list,
                                             preprocess_func)
-    val_dataset = load_dataset_from_local(val_subset_split_list,
+    val_dataset = load_dataset_from_local(val_dataset_path_list,
                                           preprocess_func)
     return train_dataset, val_dataset

From 45020e62d80cbac2e2bb73b96a2f00a4f0928c6a Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Wed, 17 Apr 2024 17:10:07 +0800
Subject: [PATCH 12/26] fix lint

---
 README_CN.md               | 2 +-
 swift/llm/utils/dataset.py | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/README_CN.md b/README_CN.md
index c9fd83b98b..a6ea303910 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -53,7 +53,7 @@ SWIFT支持近**200种LLM和MLLM**（多模态大模型）的训练、推理、
 - 🔥2024.04.02: 支持Mengzi3-13B-Base模型的推理与微调, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/mengzi3_13b_base/lora_ddp_ds/sft.sh)来开始训练！
 - 🔥2024.04.01: 支持**dbrx**系列, dbrx-base和dbrx-instruct, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/dbrx-instruct/lora_mp/sft.sh)来开始训练！.
 - 🔥2024.03.29: 支持**Qwen1.5-MoE**系列: Qwen1.5-MoE-A2.7B, Qwen1.5-MoE-A2.7B-Chat, Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4.
-- 🔥2024.03.29: 支持**Grok-1**300B MoE模型的推理与微调, 最佳实践可以查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/LLM/Grok训练和推理.md).
+- 🔥2024.03.29: 支持**Grok-1** 300B MoE模型的推理与微调, 最佳实践可以查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/LLM/Grok训练和推理.md).
 - 🔥2024.03.25: 支持TeleChat-7b和TeleChat-12b模型的训练和推理, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/telechat_12b/lora/sft.sh)来开始训练！.
 - 🔥2024.03.20: 支持**llava**系列的推理与微调, 最佳实践可以查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/Multi-Modal/llava最佳实践.md).
 - 🔥2024.03.12: 支持**deepseek-vl**系列推理和微调, 最佳实践可以查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/Multi-Modal/deepseek-vl最佳实践.md).
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index fcfde43c68..9860aa7e27 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -255,8 +255,7 @@ def load_hf_dataset(
             subset_split = (None, subset_split)
         assert len(subset_split) == 2
         subset_name, split = subset_split
-        dataset = load_dataset(
-            dataset_id, name=subset_name, split=split)
+        dataset = load_dataset(dataset_id, name=subset_name, split=split)
         dataset_list.append(dataset)
     return concatenate_datasets(dataset_list)
 
@@ -268,8 +267,7 @@ def load_hf_dataset(
     hf_dataset_id='Clinton/texttosqlv2_25000_v2')
 @register_dataset(
     DatasetName.school_math_zh,
-    'AI-ModelScope/school_math_0.25M',
-    ['train'],
+    'AI-ModelScope/school_math_0.25M', ['train'],
     tags=['chat', 'math'],
     hf_dataset_id='BelleGroup/school_math_0.25M')
 @register_dataset(

From a34384e14ef130edbed4eb57b2bef6743f93b115 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Wed, 17 Apr 2024 17:31:17 +0800
Subject: [PATCH 13/26] update arguments

---
 swift/llm/utils/argument.py | 71 +++++--------------------------------
 1 file changed, 8 insertions(+), 63 deletions(-)

diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index 416b71398c..bdf5498f56 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -222,61 +222,6 @@ def handle_compatibility(
             if self.deepspeed_config_path is not None:
                 self.deepspeed = self.deepspeed_config_path
 
-    def set_model_type(self: Union[SftArguments, InferArguments]) -> None:
-        # compat with swift<1.7
-        if args.model_cache_dir is not None and args.model_id_or_path is None:
-            args.model_id_or_path = args.model_cache_dir
-            args.model_cache_dir = None
-
-        if args.model_id_or_path is not None:
-            model_mapping_reversed = {
-                v['model_id_or_path'].lower(): k
-                for k, v in MODEL_MAPPING.items()
-            }
-            model_id_or_path = args.model_id_or_path
-            model_id_or_path_lower = model_id_or_path.lower()
-            if model_id_or_path_lower not in model_mapping_reversed:
-                if (isinstance(args, InferArguments)
-                        and 'checkpoint' in model_id_or_path
-                        and 'merged' not in model_id_or_path
-                        and args.ckpt_dir is None):
-                    raise ValueError(
-                        'Please use `--ckpt_dir vx-xxx/checkpoint-xxx` to use the checkpoint.'
-                    )
-                if args.model_type is None:
-                    raise ValueError(
-                        f"model_id_or_path: '{model_id_or_path}' is not registered. "
-                        'Please set `--model_type <model_type>` additionally.')
-                assert args.model_cache_dir is None
-            else:
-                model_type = model_mapping_reversed[model_id_or_path_lower]
-                assert args.model_type is None or args.model_type == model_type
-                args.model_type = model_type
-                logger.info(f'Setting args.model_type: {model_type}')
-                if args.model_cache_dir is not None:
-                    args.model_id_or_path = args.model_cache_dir
-
-        error_msg = f'The model_type you can choose: {list(MODEL_MAPPING.keys())}'
-        if args.model_type is None:
-            raise ValueError('please setting `--model_type <model_type>`. '
-                            + error_msg)
-        elif args.model_type not in MODEL_MAPPING:
-            raise ValueError(f"model_type: '{args.model_type}' is not registered. "
-                            + error_msg)
-        model_info = MODEL_MAPPING[args.model_type]
-        use_hf = strtobool(os.environ.get('USE_HF', 'False'))
-        if args.model_revision is not None:
-            model_info['revision'] = args.model_revision
-            logger.info(f"Setting model_info['revision']: {args.model_revision}")
-        elif use_hf:
-            model_info['revision'] = 'main'
-        args.model_revision = model_info['revision']
-        if args.model_id_or_path is None:
-            args.model_id_or_path = model_info['model_id_or_path']
-        requires = model_info['requires']
-        for require in requires:
-            require_version(require)
-
     def set_model_type(self: Union['SftArguments', 'InferArguments']) -> None:
         # compat with swift<1.7
         if self.model_cache_dir is not None and self.model_id_or_path is None:
@@ -316,16 +261,16 @@ def set_model_type(self: Union['SftArguments', 'InferArguments']) -> None:
             raise ValueError('please setting `--model_type <model_type>`. '
                              + error_msg)
         elif self.model_type not in MODEL_MAPPING:
-            raise ValueError(
-                f"model_type: '{self.model_type}' is not registered. "
-                + error_msg)
+            raise ValueError(f"model_type: '{self.model_type}' is not registered. "
+                             + error_msg)
         model_info = MODEL_MAPPING[self.model_type]
-        if self.model_revision is None:
-            self.model_revision = model_info['revision']
-        else:
+        use_hf = strtobool(os.environ.get('USE_HF', 'False'))
+        if self.model_revision is not None:
             model_info['revision'] = self.model_revision
-            logger.info(
-                f"Setting model_info['revision']: {self.model_revision}")
+            logger.info(f"Setting model_info['revision']: {self.model_revision}")
+        elif use_hf:
+            model_info['revision'] = 'main'
+        self.model_revision = model_info['revision']
         if self.model_id_or_path is None:
             self.model_id_or_path = model_info['model_id_or_path']
         requires = model_info['requires']

From b617284707832e4cb96e44e574c675e5fa929e71 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Wed, 17 Apr 2024 17:32:08 +0800
Subject: [PATCH 14/26] fix lint

---
 swift/llm/utils/argument.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index bdf5498f56..9fab7662bd 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -261,13 +261,15 @@ def set_model_type(self: Union['SftArguments', 'InferArguments']) -> None:
             raise ValueError('please setting `--model_type <model_type>`. '
                              + error_msg)
         elif self.model_type not in MODEL_MAPPING:
-            raise ValueError(f"model_type: '{self.model_type}' is not registered. "
-                             + error_msg)
+            raise ValueError(
+                f"model_type: '{self.model_type}' is not registered. "
+                + error_msg)
         model_info = MODEL_MAPPING[self.model_type]
         use_hf = strtobool(os.environ.get('USE_HF', 'False'))
         if self.model_revision is not None:
             model_info['revision'] = self.model_revision
-            logger.info(f"Setting model_info['revision']: {self.model_revision}")
+            logger.info(
+                f"Setting model_info['revision']: {self.model_revision}")
         elif use_hf:
             model_info['revision'] = 'main'
         self.model_revision = model_info['revision']

From 241d30b72781eaa6bc8747e733a4e72babe68578 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Wed, 17 Apr 2024 19:55:21 +0800
Subject: [PATCH 15/26] update scripts

---
 ...14\346\225\260\346\215\256\351\233\206.md" | 610 +++++++++---------
 .../LLM/Supported-models-datasets.md          | 610 +++++++++---------
 scripts/utils/run_dataset_info.py             |  18 +-
 scripts/utils/run_model_info.py               |  19 +-
 swift/llm/utils/argument.py                   |   3 +-
 swift/llm/utils/dataset.py                    |   6 +
 swift/llm/utils/model.py                      |   8 +
 7 files changed, 655 insertions(+), 619 deletions(-)

diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index c695115a76..a013d6ffeb 100644
--- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -12,218 +12,220 @@
 - Support VLLM: 模型是否支持[vllm](https://github.com/vllm-project/vllm)加速推理和部署.
 - Requires: 对应模型所需的额外依赖要求.
 
-| Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags |
-| ---------  | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- |
-|qwen-1_8b|[qwen/Qwen-1_8B](https://modelscope.cn/models/qwen/Qwen-1_8B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|
-|qwen-1_8b-chat|[qwen/Qwen-1_8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|
-|qwen-1_8b-chat-int4|[qwen/Qwen-1_8B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|
-|qwen-1_8b-chat-int8|[qwen/Qwen-1_8B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|
-|qwen-7b|[qwen/Qwen-7B](https://modelscope.cn/models/qwen/Qwen-7B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|
-|qwen-7b-chat|[qwen/Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|
-|qwen-7b-chat-int4|[qwen/Qwen-7B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|
-|qwen-7b-chat-int8|[qwen/Qwen-7B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|
-|qwen-14b|[qwen/Qwen-14B](https://modelscope.cn/models/qwen/Qwen-14B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|
-|qwen-14b-chat|[qwen/Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|
-|qwen-14b-chat-int4|[qwen/Qwen-14B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|
-|qwen-14b-chat-int8|[qwen/Qwen-14B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|
-|qwen-72b|[qwen/Qwen-72B](https://modelscope.cn/models/qwen/Qwen-72B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|
-|qwen-72b-chat|[qwen/Qwen-72B-Chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|
-|qwen-72b-chat-int4|[qwen/Qwen-72B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|
-|qwen-72b-chat-int8|[qwen/Qwen-72B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|
-|qwen1half-0_5b|[qwen/Qwen1.5-0.5B](https://modelscope.cn/models/qwen/Qwen1.5-0.5B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-1_8b|[qwen/Qwen1.5-1.8B](https://modelscope.cn/models/qwen/Qwen1.5-1.8B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-4b|[qwen/Qwen1.5-4B](https://modelscope.cn/models/qwen/Qwen1.5-4B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-7b|[qwen/Qwen1.5-7B](https://modelscope.cn/models/qwen/Qwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-14b|[qwen/Qwen1.5-14B](https://modelscope.cn/models/qwen/Qwen1.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-32b|[qwen/Qwen1.5-32B](https://modelscope.cn/models/qwen/Qwen1.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-72b|[qwen/Qwen1.5-72B](https://modelscope.cn/models/qwen/Qwen1.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|codeqwen1half-7b|[qwen/CodeQwen1.5-7B](https://modelscope.cn/models/qwen/CodeQwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-moe-a2_7b|[qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-0_5b-chat|[qwen/Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-1_8b-chat|[qwen/Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-4b-chat|[qwen/Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-7b-chat|[qwen/Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-14b-chat|[qwen/Qwen1.5-14B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-32b-chat|[qwen/Qwen1.5-32B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-72b-chat|[qwen/Qwen1.5-72B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-moe-a2_7b-chat|[qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|codeqwen1half-7b-chat|[qwen/CodeQwen1.5-7B-Chat](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-0_5b-chat-int4|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-1_8b-chat-int4|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-4b-chat-int4|[qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-7b-chat-int4|[qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-14b-chat-int4|[qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-32b-chat-int4|[qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-72b-chat-int4|[qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-0_5b-chat-int8|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-1_8b-chat-int8|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-4b-chat-int8|[qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-7b-chat-int8|[qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-14b-chat-int8|[qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-72b-chat-int8|[qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-moe-a2_7b-chat-int4|[qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-0_5b-chat-awq|[qwen/Qwen1.5-0.5B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|qwen1half-1_8b-chat-awq|[qwen/Qwen1.5-1.8B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|qwen1half-4b-chat-awq|[qwen/Qwen1.5-4B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|qwen1half-7b-chat-awq|[qwen/Qwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|qwen1half-14b-chat-awq|[qwen/Qwen1.5-14B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|qwen1half-72b-chat-awq|[qwen/Qwen1.5-72B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|codeqwen1half-7b-chat-awq|[qwen/CodeQwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|&#x2714;|&#x2718;||multi-modal, vision|
-|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|&#x2714;|&#x2718;||multi-modal, vision|
-|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|multi-modal, vision|
-|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|&#x2714;|&#x2718;||multi-modal, audio|
-|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|&#x2714;|&#x2718;||multi-modal, audio|
-|chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|&#x2718;|&#x2714;||-|
-|chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|&#x2718;|&#x2714;||-|
-|chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|&#x2718;|&#x2714;||-|
-|chatglm3-6b|[ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)|query_key_value|chatglm3|&#x2718;|&#x2714;||-|
-|chatglm3-6b-32k|[ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)|query_key_value|chatglm3|&#x2718;|&#x2714;||-|
-|codegeex2-6b|[ZhipuAI/codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary)|query_key_value|chatglm-generation|&#x2718;|&#x2714;|transformers<4.34|coding|
-|llama2-7b|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|llama2-7b-chat|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;||-|
-|llama2-13b|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|llama2-13b-chat|[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;||-|
-|llama2-70b|[modelscope/Llama-2-70b-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|llama2-70b-chat|[modelscope/Llama-2-70b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;||-|
-|llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;|transformers>=4.38, aqlm, torch>=2.2.0|-|
-|llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|
-|llava1d6-yi-34b-instruct|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b/summary)|q_proj, k_proj, v_proj|llava-yi-instruct|&#x2714;|&#x2718;||multi-modal, vision|
-|yi-6b|[01ai/Yi-6B](https://modelscope.cn/models/01ai/Yi-6B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|
-|yi-6b-200k|[01ai/Yi-6B-200K](https://modelscope.cn/models/01ai/Yi-6B-200K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|
-|yi-6b-chat|[01ai/Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary)|q_proj, k_proj, v_proj|yi|&#x2714;|&#x2714;||-|
-|yi-9b|[01ai/Yi-9B](https://modelscope.cn/models/01ai/Yi-9B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|
-|yi-34b|[01ai/Yi-34B](https://modelscope.cn/models/01ai/Yi-34B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|
-|yi-34b-200k|[01ai/Yi-34B-200K](https://modelscope.cn/models/01ai/Yi-34B-200K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|
-|yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|&#x2714;|&#x2714;||-|
-|yi-vl-6b-chat|[01ai/Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary)|q_proj, k_proj, v_proj|yi-vl|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|
-|yi-vl-34b-chat|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B/summary)|q_proj, k_proj, v_proj|yi-vl|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|
-|internlm-7b|[Shanghai_AI_Laboratory/internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2714;||-|
-|internlm-7b-chat|[Shanghai_AI_Laboratory/internlm-chat-7b-v1_1](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary)|q_proj, k_proj, v_proj|internlm|&#x2718;|&#x2714;||-|
-|internlm-7b-chat-8k|[Shanghai_AI_Laboratory/internlm-chat-7b-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary)|q_proj, k_proj, v_proj|internlm|&#x2718;|&#x2714;||-|
-|internlm-20b|[Shanghai_AI_Laboratory/internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2714;||-|
-|internlm-20b-chat|[Shanghai_AI_Laboratory/internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)|q_proj, k_proj, v_proj|internlm|&#x2718;|&#x2714;||-|
-|internlm2-1_8b|[Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|
-|internlm2-1_8b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|
-|internlm2-1_8b-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|
-|internlm2-7b-base|[Shanghai_AI_Laboratory/internlm2-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-7b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|
-|internlm2-7b|[Shanghai_AI_Laboratory/internlm2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-7b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|
-|internlm2-7b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b-sft/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|
-|internlm2-7b-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|
-|internlm2-20b-base|[Shanghai_AI_Laboratory/internlm2-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-20b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|
-|internlm2-20b|[Shanghai_AI_Laboratory/internlm2-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-20b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|
-|internlm2-20b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b-sft/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|
-|internlm2-20b-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|
-|internlm2-math-7b|[Shanghai_AI_Laboratory/internlm2-math-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-7b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||math|
-|internlm2-math-7b-chat|[Shanghai_AI_Laboratory/internlm2-math-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||math|
-|internlm2-math-20b|[Shanghai_AI_Laboratory/internlm2-math-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-20b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||math|
-|internlm2-math-20b-chat|[Shanghai_AI_Laboratory/internlm2-math-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-20b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||math|
-|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|wqkv|internlm-xcomposer2|&#x2714;|&#x2718;||multi-modal, vision|
-|deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||-|
-|deepseek-moe-16b|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|deepseek-moe-16b-chat|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||-|
-|deepseek-67b|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|deepseek-67b-chat|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||-|
-|deepseek-coder-1_3b|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||coding|
-|deepseek-coder-1_3b-instruct|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||coding|
-|deepseek-coder-6_7b|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||coding|
-|deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||coding|
-|deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||coding|
-|deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||coding|
-|deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||math|
-|deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||math|
-|deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||math|
-|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;||multi-modal, vision|
-|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;||multi-modal, vision|
-|gemma-2b|[AI-ModelScope/gemma-2b](https://modelscope.cn/models/AI-ModelScope/gemma-2b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.38|-|
-|gemma-7b|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.38|-|
-|gemma-2b-instruct|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it/summary)|q_proj, k_proj, v_proj|gemma|&#x2714;|&#x2714;|transformers>=4.38|-|
-|gemma-7b-instruct|[AI-ModelScope/gemma-7b-it](https://modelscope.cn/models/AI-ModelScope/gemma-7b-it/summary)|q_proj, k_proj, v_proj|gemma|&#x2714;|&#x2714;|transformers>=4.38|-|
-|minicpm-1b-sft-chat|[OpenBMB/MiniCPM-1B-sft-bf16](https://modelscope.cn/models/OpenBMB/MiniCPM-1B-sft-bf16/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;|transformers>=4.36.0|-|
-|minicpm-2b-sft-chat|[OpenBMB/MiniCPM-2B-sft-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-sft-fp32/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;||-|
-|minicpm-2b-chat|[OpenBMB/MiniCPM-2B-dpo-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp32/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;||-|
-|minicpm-2b-128k|[OpenBMB/MiniCPM-2B-128k](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-128k/summary)|q_proj, k_proj, v_proj|chatml|&#x2714;|&#x2714;|transformers>=4.36.0|-|
-|minicpm-moe-8x2b|[OpenBMB/MiniCPM-MoE-8x2B](https://modelscope.cn/models/OpenBMB/MiniCPM-MoE-8x2B/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;|transformers>=4.36.0|-|
-|minicpm-v-3b-chat|[OpenBMB/MiniCPM-V](https://modelscope.cn/models/OpenBMB/MiniCPM-V/summary)|q_proj, k_proj, v_proj|minicpm-v|&#x2714;|&#x2718;||-|
-|minicpm-v-v2|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2/summary)|q_proj, k_proj, v_proj|minicpm-v|&#x2714;|&#x2718;||-|
-|openbuddy-llama2-13b-chat|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|
-|openbuddy-llama-65b-chat|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|
-|openbuddy-llama2-70b-chat|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|
-|openbuddy-mistral-7b-chat|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v17.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.34|-|
-|openbuddy-zephyr-7b-chat|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.34|-|
-|openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|
-|openbuddy-mixtral-moe-7b-chat|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.36|-|
-|mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.34|-|
-|mistral-7b-v2|[AI-ModelScope/Mistral-7B-v0.2-hf](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.2-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.34|-|
-|mistral-7b-instruct|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.34|-|
-|mistral-7b-instruct-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.34|-|
-|mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.36|-|
-|mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.36|-|
-|mixtral-moe-7b-aqlm-2bit-1x16|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;|transformers>=4.38, aqlm, torch>=2.2.0|-|
-|mixtral-moe-8x22b-v1|[AI-ModelScope/Mixtral-8x22B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x22B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.36|-|
-|baichuan-7b|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)|W_pack|default-generation|&#x2718;|&#x2714;|transformers<4.34|-|
-|baichuan-13b|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;|transformers<4.34|-|
-|baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;|transformers<4.34|-|
-|baichuan2-7b|[baichuan-inc/Baichuan2-7B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;||-|
-|baichuan2-7b-chat|[baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;||-|
-|baichuan2-7b-chat-int4|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary)|W_pack|baichuan|&#x2718;|&#x2718;|bitsandbytes<0.41.2, accelerate<0.26|-|
-|baichuan2-13b|[baichuan-inc/Baichuan2-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;||-|
-|baichuan2-13b-chat|[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;||-|
-|baichuan2-13b-chat-int4|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)|W_pack|baichuan|&#x2718;|&#x2718;|bitsandbytes<0.41.2, accelerate<0.26|-|
-|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|&#x2714;|&#x2718;|transformers<4.35, icecream|-|
-|mplug-owl2d1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|&#x2714;|&#x2718;|transformers<4.35, icecream|-|
-|yuan2-2b-instruct|[YuanLLM/Yuan2.0-2B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-2B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|
-|yuan2-2b-janus-instruct|[YuanLLM/Yuan2-2B-Janus-hf](https://modelscope.cn/models/YuanLLM/Yuan2-2B-Janus-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|
-|yuan2-51b-instruct|[YuanLLM/Yuan2.0-51B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-51B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|
-|yuan2-102b-instruct|[YuanLLM/Yuan2.0-102B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-102B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|
-|xverse-7b|[xverse/XVERSE-7B](https://modelscope.cn/models/xverse/XVERSE-7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|xverse-7b-chat|[xverse/XVERSE-7B-Chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary)|q_proj, k_proj, v_proj|xverse|&#x2718;|&#x2718;||-|
-|xverse-13b|[xverse/XVERSE-13B](https://modelscope.cn/models/xverse/XVERSE-13B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|xverse-13b-chat|[xverse/XVERSE-13B-Chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)|q_proj, k_proj, v_proj|xverse|&#x2718;|&#x2718;||-|
-|xverse-65b|[xverse/XVERSE-65B](https://modelscope.cn/models/xverse/XVERSE-65B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|xverse-65b-v2|[xverse/XVERSE-65B-2](https://modelscope.cn/models/xverse/XVERSE-65B-2/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|xverse-65b-chat|[xverse/XVERSE-65B-Chat](https://modelscope.cn/models/xverse/XVERSE-65B-Chat/summary)|q_proj, k_proj, v_proj|xverse|&#x2718;|&#x2718;||-|
-|xverse-13b-256k|[xverse/XVERSE-13B-256K](https://modelscope.cn/models/xverse/XVERSE-13B-256K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|xverse-moe-a4_2b|[xverse/XVERSE-MoE-A4.2B](https://modelscope.cn/models/xverse/XVERSE-MoE-A4.2B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|orion-14b|[OrionStarAI/Orion-14B-Base](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2718;||-|
-|orion-14b-chat|[OrionStarAI/Orion-14B-Chat](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat/summary)|q_proj, k_proj, v_proj|orion|&#x2714;|&#x2718;||-|
-|bluelm-7b|[vivo-ai/BlueLM-7B-Base](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2718;||-|
-|bluelm-7b-32k|[vivo-ai/BlueLM-7B-Base-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2718;||-|
-|bluelm-7b-chat|[vivo-ai/BlueLM-7B-Chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary)|q_proj, k_proj, v_proj|bluelm|&#x2718;|&#x2718;||-|
-|bluelm-7b-chat-32k|[vivo-ai/BlueLM-7B-Chat-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)|q_proj, k_proj, v_proj|bluelm|&#x2718;|&#x2718;||-|
-|ziya2-13b|[Fengshenbang/Ziya2-13B-Base](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|ziya2-13b-chat|[Fengshenbang/Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)|q_proj, k_proj, v_proj|ziya|&#x2714;|&#x2714;||-|
-|skywork-13b|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2718;||-|
-|skywork-13b-chat|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)|q_proj, k_proj, v_proj|skywork|&#x2718;|&#x2718;||-|
-|zephyr-7b-beta-chat|[modelscope/zephyr-7b-beta](https://modelscope.cn/models/modelscope/zephyr-7b-beta/summary)|q_proj, k_proj, v_proj|zephyr|&#x2714;|&#x2714;|transformers>=4.34|-|
-|polylm-13b|[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary)|c_attn|default-generation|&#x2718;|&#x2718;||-|
-|seqgpt-560m|[damo/nlp_seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)|query_key_value|default-generation|&#x2718;|&#x2714;||-|
-|sus-34b-chat|[SUSTC/SUS-Chat-34B](https://modelscope.cn/models/SUSTC/SUS-Chat-34B/summary)|q_proj, k_proj, v_proj|sus|&#x2714;|&#x2714;||-|
-|tongyi-finance-14b|[TongyiFinance/Tongyi-Finance-14B](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||financial|
-|tongyi-finance-14b-chat|[TongyiFinance/Tongyi-Finance-14B-Chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||financial|
-|tongyi-finance-14b-chat-int4|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|financial|
-|codefuse-codellama-34b-chat|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)|q_proj, k_proj, v_proj|codefuse-codellama|&#x2714;|&#x2714;||coding|
-|codefuse-codegeex2-6b-chat|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B/summary)|query_key_value|codefuse|&#x2718;|&#x2714;|transformers<4.34|coding|
-|codefuse-qwen-14b-chat|[codefuse-ai/CodeFuse-QWen-14B](https://modelscope.cn/models/codefuse-ai/CodeFuse-QWen-14B/summary)|c_attn|codefuse|&#x2714;|&#x2714;||coding|
-|phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|&#x2714;|&#x2714;||coding|
-|cogvlm-17b-instruct|[ZhipuAI/cogvlm-chat](https://modelscope.cn/models/ZhipuAI/cogvlm-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense|cogvlm-instruct|&#x2718;|&#x2718;||multi-modal, vision|
-|cogagent-18b-chat|[ZhipuAI/cogagent-chat](https://modelscope.cn/models/ZhipuAI/cogagent-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-chat|&#x2718;|&#x2718;||multi-modal, vision|
-|cogagent-18b-instruct|[ZhipuAI/cogagent-vqa](https://modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-instruct|&#x2718;|&#x2718;||multi-modal, vision|
-|mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|
-|mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|
-|mamba-390m|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|
-|mamba-790m|[AI-ModelScope/mamba-790m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-790m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|
-|mamba-1.4b|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|
-|mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|
-|telechat-7b|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B/summary)|key_value, query|telechat|&#x2714;|&#x2718;||-|
-|telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|key_value, query|telechat|&#x2714;|&#x2718;||-|
-|grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|
-|dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|
-|mengzi3-13b-base|[langboat/Mengzi3-13B-Base](https://modelscope.cn/models/langboat/Mengzi3-13B-Base/summary)|q_proj, k_proj, v_proj|mengzi|&#x2714;|&#x2714;||-|
-|c4ai-command-r-v01|[AI-ModelScope/c4ai-command-r-v01](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-v01/summary)|q_proj, k_proj, v_proj|c4ai|&#x2714;|&#x2718;|transformers>=4.39.1|-|
-|c4ai-command-r-plus|[AI-ModelScope/c4ai-command-r-plus](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-plus/summary)|q_proj, k_proj, v_proj|c4ai|&#x2714;|&#x2718;|transformers>4.39|-|
+| Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID |
+| ---------  | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | ----------- |
+|qwen-1_8b|[qwen/Qwen-1_8B](https://modelscope.cn/models/qwen/Qwen-1_8B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|[Qwen/Qwen1.5-1.8B](https://huggingface.co/Qwen/Qwen1.5-1.8B)|
+|qwen-1_8b-chat|[qwen/Qwen-1_8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|[Qwen/Qwen-1_8B-Chat](https://huggingface.co/Qwen/Qwen-1_8B-Chat)|
+|qwen-1_8b-chat-int4|[qwen/Qwen-1_8B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|[Qwen/Qwen-1_8B-Chat-Int4](https://huggingface.co/Qwen/Qwen-1_8B-Chat-Int4)|
+|qwen-1_8b-chat-int8|[qwen/Qwen-1_8B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|[Qwen/Qwen-1_8B-Chat-Int8](https://huggingface.co/Qwen/Qwen-1_8B-Chat-Int8)|
+|qwen-7b|[qwen/Qwen-7B](https://modelscope.cn/models/qwen/Qwen-7B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|[Qwen/Qwen-7B](https://huggingface.co/Qwen/Qwen-7B)|
+|qwen-7b-chat|[qwen/Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|[Qwen/Qwen-7B-Chat](https://huggingface.co/Qwen/Qwen-7B-Chat)|
+|qwen-7b-chat-int4|[qwen/Qwen-7B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|[Qwen/Qwen-7B-Chat-Int4](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4)|
+|qwen-7b-chat-int8|[qwen/Qwen-7B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|[Qwen/Qwen-7B-Chat-Int8](https://huggingface.co/Qwen/Qwen-7B-Chat-Int8)|
+|qwen-14b|[qwen/Qwen-14B](https://modelscope.cn/models/qwen/Qwen-14B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|[Qwen/Qwen-14B](https://huggingface.co/Qwen/Qwen-14B)|
+|qwen-14b-chat|[qwen/Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|[Qwen/Qwen-14B-Chat](https://huggingface.co/Qwen/Qwen-14B-Chat)|
+|qwen-14b-chat-int4|[qwen/Qwen-14B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|[Qwen/Qwen-14B-Chat-Int4](https://huggingface.co/Qwen/Qwen-14B-Chat-Int4)|
+|qwen-14b-chat-int8|[qwen/Qwen-14B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|[Qwen/Qwen-14B-Chat-Int8](https://huggingface.co/Qwen/Qwen-14B-Chat-Int8)|
+|qwen-72b|[qwen/Qwen-72B](https://modelscope.cn/models/qwen/Qwen-72B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|[Qwen/Qwen-72B](https://huggingface.co/Qwen/Qwen-72B)|
+|qwen-72b-chat|[qwen/Qwen-72B-Chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|[Qwen/Qwen-72B-Chat](https://huggingface.co/Qwen/Qwen-72B-Chat)|
+|qwen-72b-chat-int4|[qwen/Qwen-72B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|[Qwen/Qwen-72B-Chat-Int4](https://huggingface.co/Qwen/Qwen-72B-Chat-Int4)|
+|qwen-72b-chat-int8|[qwen/Qwen-72B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|[Qwen/Qwen-72B-Chat-Int8](https://huggingface.co/Qwen/Qwen-72B-Chat-Int8)|
+|qwen1half-0_5b|[qwen/Qwen1.5-0.5B](https://modelscope.cn/models/qwen/Qwen1.5-0.5B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-0.5B](https://huggingface.co/Qwen/Qwen1.5-0.5B)|
+|qwen1half-1_8b|[qwen/Qwen1.5-1.8B](https://modelscope.cn/models/qwen/Qwen1.5-1.8B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-1.8B](https://huggingface.co/Qwen/Qwen1.5-1.8B)|
+|qwen1half-4b|[qwen/Qwen1.5-4B](https://modelscope.cn/models/qwen/Qwen1.5-4B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B)|
+|qwen1half-7b|[qwen/Qwen1.5-7B](https://modelscope.cn/models/qwen/Qwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-7B](https://huggingface.co/Qwen/Qwen1.5-7B)|
+|qwen1half-14b|[qwen/Qwen1.5-14B](https://modelscope.cn/models/qwen/Qwen1.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-14B](https://huggingface.co/Qwen/Qwen1.5-14B)|
+|qwen1half-32b|[qwen/Qwen1.5-32B](https://modelscope.cn/models/qwen/Qwen1.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-32B](https://huggingface.co/Qwen/Qwen1.5-32B)|
+|qwen1half-72b|[qwen/Qwen1.5-72B](https://modelscope.cn/models/qwen/Qwen1.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-72B](https://huggingface.co/Qwen/Qwen1.5-72B)|
+|codeqwen1half-7b|[qwen/CodeQwen1.5-7B](https://modelscope.cn/models/qwen/CodeQwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|-|
+|qwen1half-moe-a2_7b|[qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B)|
+|qwen1half-0_5b-chat|[qwen/Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-0.5B-Chat](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat)|
+|qwen1half-1_8b-chat|[qwen/Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-1.8B-Chat](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat)|
+|qwen1half-4b-chat|[qwen/Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-4B-Chat](https://huggingface.co/Qwen/Qwen1.5-4B-Chat)|
+|qwen1half-7b-chat|[qwen/Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-7B-Chat](https://huggingface.co/Qwen/Qwen1.5-7B-Chat)|
+|qwen1half-14b-chat|[qwen/Qwen1.5-14B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-14B-Chat](https://huggingface.co/Qwen/Qwen1.5-14B-Chat)|
+|qwen1half-32b-chat|[qwen/Qwen1.5-32B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-32B-Chat](https://huggingface.co/Qwen/Qwen1.5-32B-Chat)|
+|qwen1half-72b-chat|[qwen/Qwen1.5-72B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-72B-Chat](https://huggingface.co/Qwen/Qwen1.5-72B-Chat)|
+|qwen1half-moe-a2_7b-chat|[qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat)|
+|codeqwen1half-7b-chat|[qwen/CodeQwen1.5-7B-Chat](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|-|
+|qwen1half-0_5b-chat-int4|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4)|
+|qwen1half-1_8b-chat-int4|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4)|
+|qwen1half-4b-chat-int4|[qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GPTQ-Int4)|
+|qwen1half-7b-chat-int4|[qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4)|
+|qwen1half-14b-chat-int4|[qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-GPTQ-Int4)|
+|qwen1half-32b-chat-int4|[qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-32B-Chat-GPTQ-Int4)|
+|qwen1half-72b-chat-int4|[qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-GPTQ-Int4)|
+|qwen1half-0_5b-chat-int8|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8)|
+|qwen1half-1_8b-chat-int8|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8)|
+|qwen1half-4b-chat-int8|[qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GPTQ-Int8)|
+|qwen1half-7b-chat-int8|[qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int8)|
+|qwen1half-14b-chat-int8|[qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-GPTQ-Int8)|
+|qwen1half-72b-chat-int8|[qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-GPTQ-Int8)|
+|qwen1half-moe-a2_7b-chat-int4|[qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4)|
+|qwen1half-0_5b-chat-awq|[qwen/Qwen1.5-0.5B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-0.5B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-AWQ)|
+|qwen1half-1_8b-chat-awq|[qwen/Qwen1.5-1.8B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-1.8B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-AWQ)|
+|qwen1half-4b-chat-awq|[qwen/Qwen1.5-4B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-4B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-AWQ)|
+|qwen1half-7b-chat-awq|[qwen/Qwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-7B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-AWQ)|
+|qwen1half-14b-chat-awq|[qwen/Qwen1.5-14B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-14B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-AWQ)|
+|qwen1half-32b-chat-awq|[qwen/Qwen1.5-32B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-32B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-32B-Chat-AWQ)|
+|qwen1half-72b-chat-awq|[qwen/Qwen1.5-72B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-72B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-AWQ)|
+|codeqwen1half-7b-chat-awq|[qwen/CodeQwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|-|
+|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|&#x2714;|&#x2718;||multi-modal, vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)|
+|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|&#x2714;|&#x2718;||multi-modal, vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)|
+|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|multi-modal, vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)|
+|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|&#x2714;|&#x2718;||multi-modal, audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)|
+|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|&#x2714;|&#x2718;||multi-modal, audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)|
+|chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|&#x2718;|&#x2714;||-|[THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)|
+|chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|&#x2718;|&#x2714;||-|[THUDM/chatglm2-6b-32k](https://huggingface.co/THUDM/chatglm2-6b-32k)|
+|chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|&#x2718;|&#x2714;||-|[THUDM/chatglm3-6b-base](https://huggingface.co/THUDM/chatglm3-6b-base)|
+|chatglm3-6b|[ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)|query_key_value|chatglm3|&#x2718;|&#x2714;||-|[THUDM/chatglm3-6b](https://huggingface.co/THUDM/chatglm3-6b)|
+|chatglm3-6b-32k|[ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)|query_key_value|chatglm3|&#x2718;|&#x2714;||-|[THUDM/chatglm3-6b-32k](https://huggingface.co/THUDM/chatglm3-6b-32k)|
+|codegeex2-6b|[ZhipuAI/codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary)|query_key_value|chatglm-generation|&#x2718;|&#x2714;|transformers<4.34|coding|[THUDM/codegeex2-6b](https://huggingface.co/THUDM/codegeex2-6b)|
+|llama2-7b|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)|
+|llama2-7b-chat|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;||-|[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)|
+|llama2-13b|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf)|
+|llama2-13b-chat|[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;||-|[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)|
+|llama2-70b|[modelscope/Llama-2-70b-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf)|
+|llama2-70b-chat|[modelscope/Llama-2-70b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;||-|[meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)|
+|llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)|
+|llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)|
+|llava1d6-yi-34b-instruct|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b/summary)|q_proj, k_proj, v_proj|llava-yi-instruct|&#x2714;|&#x2718;||multi-modal, vision|[liuhaotian/llava-v1.6-34b](https://huggingface.co/liuhaotian/llava-v1.6-34b)|
+|yi-6b|[01ai/Yi-6B](https://modelscope.cn/models/01ai/Yi-6B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B)|
+|yi-6b-200k|[01ai/Yi-6B-200K](https://modelscope.cn/models/01ai/Yi-6B-200K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[01-ai/Yi-6B-200K](https://huggingface.co/01-ai/Yi-6B-200K)|
+|yi-6b-chat|[01ai/Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary)|q_proj, k_proj, v_proj|yi|&#x2714;|&#x2714;||-|[01-ai/Yi-6B-Chat](https://huggingface.co/01-ai/Yi-6B-Chat)|
+|yi-9b|[01ai/Yi-9B](https://modelscope.cn/models/01ai/Yi-9B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[01-ai/Yi-9B](https://huggingface.co/01-ai/Yi-9B)|
+|yi-9b-200k|[01ai/Yi-9B-200K](https://modelscope.cn/models/01ai/Yi-9B-200K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[01-ai/Yi-9B-200K](https://huggingface.co/01-ai/Yi-9B-200K)|
+|yi-34b|[01ai/Yi-34B](https://modelscope.cn/models/01ai/Yi-34B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[01-ai/Yi-34B](https://huggingface.co/01-ai/Yi-34B)|
+|yi-34b-200k|[01ai/Yi-34B-200K](https://modelscope.cn/models/01ai/Yi-34B-200K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[01-ai/Yi-34B-200K](https://huggingface.co/01-ai/Yi-34B-200K)|
+|yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|&#x2714;|&#x2714;||-|[01-ai/Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat)|
+|yi-vl-6b-chat|[01ai/Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary)|q_proj, k_proj, v_proj|yi-vl|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|[01-ai/Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B)|
+|yi-vl-34b-chat|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B/summary)|q_proj, k_proj, v_proj|yi-vl|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|[01-ai/Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)|
+|internlm-7b|[Shanghai_AI_Laboratory/internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2714;||-|[internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b)|
+|internlm-7b-chat|[Shanghai_AI_Laboratory/internlm-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b/summary)|q_proj, k_proj, v_proj|internlm|&#x2718;|&#x2714;||-|[internlm/internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b)|
+|internlm-7b-chat-8k|[Shanghai_AI_Laboratory/internlm-chat-7b-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary)|q_proj, k_proj, v_proj|internlm|&#x2718;|&#x2714;||-|-|
+|internlm-20b|[Shanghai_AI_Laboratory/internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2714;||-|[internlm/internlm2-20b](https://huggingface.co/internlm/internlm2-20b)|
+|internlm-20b-chat|[Shanghai_AI_Laboratory/internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)|q_proj, k_proj, v_proj|internlm|&#x2718;|&#x2714;||-|[internlm/internlm2-chat-20b](https://huggingface.co/internlm/internlm2-chat-20b)|
+|internlm2-1_8b|[Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|[internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b)|
+|internlm2-1_8b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|[internlm/internlm2-chat-1_8b-sft](https://huggingface.co/internlm/internlm2-chat-1_8b-sft)|
+|internlm2-1_8b-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|[internlm/internlm2-chat-1_8b](https://huggingface.co/internlm/internlm2-chat-1_8b)|
+|internlm2-7b-base|[Shanghai_AI_Laboratory/internlm2-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-7b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|[internlm/internlm2-base-7b](https://huggingface.co/internlm/internlm2-base-7b)|
+|internlm2-7b|[Shanghai_AI_Laboratory/internlm2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-7b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|[internlm/internlm2-7b](https://huggingface.co/internlm/internlm2-7b)|
+|internlm2-7b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b-sft/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|[internlm/internlm2-chat-7b-sft](https://huggingface.co/internlm/internlm2-chat-7b-sft)|
+|internlm2-7b-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|[internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b)|
+|internlm2-20b-base|[Shanghai_AI_Laboratory/internlm2-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-20b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|[internlm/internlm2-base-20b](https://huggingface.co/internlm/internlm2-base-20b)|
+|internlm2-20b|[Shanghai_AI_Laboratory/internlm2-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-20b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|[internlm/internlm2-20b](https://huggingface.co/internlm/internlm2-20b)|
+|internlm2-20b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b-sft/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|[internlm/internlm2-chat-20b-sft](https://huggingface.co/internlm/internlm2-chat-20b-sft)|
+|internlm2-20b-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|[internlm/internlm2-chat-20b](https://huggingface.co/internlm/internlm2-chat-20b)|
+|internlm2-math-7b|[Shanghai_AI_Laboratory/internlm2-math-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-7b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||math|[internlm/internlm2-math-base-7b](https://huggingface.co/internlm/internlm2-math-base-7b)|
+|internlm2-math-7b-chat|[Shanghai_AI_Laboratory/internlm2-math-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||math|[internlm/internlm2-math-7b](https://huggingface.co/internlm/internlm2-math-7b)|
+|internlm2-math-20b|[Shanghai_AI_Laboratory/internlm2-math-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-20b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||math|[internlm/internlm2-math-base-20b](https://huggingface.co/internlm/internlm2-math-base-20b)|
+|internlm2-math-20b-chat|[Shanghai_AI_Laboratory/internlm2-math-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-20b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||math|[internlm/internlm2-math-20b](https://huggingface.co/internlm/internlm2-math-20b)|
+|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|wqkv|internlm-xcomposer2|&#x2714;|&#x2718;||multi-modal, vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)|
+|deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[deepseek-ai/deepseek-llm-7b-base](https://huggingface.co/deepseek-ai/deepseek-llm-7b-base)|
+|deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||-|[deepseek-ai/deepseek-llm-7b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat)|
+|deepseek-moe-16b|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[deepseek-ai/deepseek-moe-16b-base](https://huggingface.co/deepseek-ai/deepseek-moe-16b-base)|
+|deepseek-moe-16b-chat|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||-|[deepseek-ai/deepseek-moe-16b-chat](https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat)|
+|deepseek-67b|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[deepseek-ai/deepseek-llm-67b-base](https://huggingface.co/deepseek-ai/deepseek-llm-67b-base)|
+|deepseek-67b-chat|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||-|[deepseek-ai/deepseek-llm-67b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat)|
+|deepseek-coder-1_3b|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||coding|[deepseek-ai/deepseek-coder-1.3b-base](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base)|
+|deepseek-coder-1_3b-instruct|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||coding|[deepseek-ai/deepseek-coder-1.3b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-instruct)|
+|deepseek-coder-6_7b|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||coding|[deepseek-ai/deepseek-coder-6.7b-base](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base)|
+|deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)|
+|deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)|
+|deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)|
+|deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)|
+|deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)|
+|deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)|
+|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;||multi-modal, vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
+|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;||multi-modal, vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
+|gemma-2b|[AI-ModelScope/gemma-2b](https://modelscope.cn/models/AI-ModelScope/gemma-2b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-2b](https://huggingface.co/google/gemma-2b)|
+|gemma-7b|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-7b](https://huggingface.co/google/gemma-7b)|
+|gemma-2b-instruct|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it/summary)|q_proj, k_proj, v_proj|gemma|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-2b-it](https://huggingface.co/google/gemma-2b-it)|
+|gemma-7b-instruct|[AI-ModelScope/gemma-7b-it](https://modelscope.cn/models/AI-ModelScope/gemma-7b-it/summary)|q_proj, k_proj, v_proj|gemma|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it)|
+|minicpm-1b-sft-chat|[OpenBMB/MiniCPM-1B-sft-bf16](https://modelscope.cn/models/OpenBMB/MiniCPM-1B-sft-bf16/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;|transformers>=4.36.0|-|[openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16)|
+|minicpm-2b-sft-chat|[OpenBMB/MiniCPM-2B-sft-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-sft-fp32/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;||-|[openbmb/MiniCPM-2B-sft-fp32](https://huggingface.co/openbmb/MiniCPM-2B-sft-fp32)|
+|minicpm-2b-chat|[OpenBMB/MiniCPM-2B-dpo-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp32/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;||-|[openbmb/MiniCPM-2B-dpo-fp32](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32)|
+|minicpm-2b-128k|[OpenBMB/MiniCPM-2B-128k](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-128k/summary)|q_proj, k_proj, v_proj|chatml|&#x2714;|&#x2714;|transformers>=4.36.0|-|[openbmb/MiniCPM-2B-128k](https://huggingface.co/openbmb/MiniCPM-2B-128k)|
+|minicpm-moe-8x2b|[OpenBMB/MiniCPM-MoE-8x2B](https://modelscope.cn/models/OpenBMB/MiniCPM-MoE-8x2B/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;|transformers>=4.36.0|-|[openbmb/MiniCPM-MoE-8x2B](https://huggingface.co/openbmb/MiniCPM-MoE-8x2B)|
+|minicpm-v-3b-chat|[OpenBMB/MiniCPM-V](https://modelscope.cn/models/OpenBMB/MiniCPM-V/summary)|q_proj, k_proj, v_proj|minicpm-v|&#x2714;|&#x2718;||-|[openbmb/MiniCPM-V](https://huggingface.co/openbmb/MiniCPM-V)|
+|minicpm-v-v2|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2/summary)|q_proj, k_proj, v_proj|minicpm-v|&#x2714;|&#x2718;||-|[openbmb/MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2)|
+|openbuddy-llama2-13b-chat|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://huggingface.co/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16)|
+|openbuddy-llama-65b-chat|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama-65b-v8-bf16)|
+|openbuddy-llama2-70b-chat|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16)|
+|openbuddy-mistral-7b-chat|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v17.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.34|-|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://huggingface.co/OpenBuddy/openbuddy-mistral-7b-v17.1-32k)|
+|openbuddy-zephyr-7b-chat|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.34|-|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://huggingface.co/OpenBuddy/openbuddy-zephyr-7b-v14.1)|
+|openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://huggingface.co/OpenBuddy/openbuddy-deepseek-67b-v15.2)|
+|openbuddy-mixtral-moe-7b-chat|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.36|-|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://huggingface.co/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k)|
+|mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.34|-|[mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)|
+|mistral-7b-v2|[AI-ModelScope/Mistral-7B-v0.2-hf](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.2-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.34|-|[alpindale/Mistral-7B-v0.2-hf](https://huggingface.co/alpindale/Mistral-7B-v0.2-hf)|
+|mistral-7b-instruct|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.34|-|[mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)|
+|mistral-7b-instruct-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.34|-|[mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)|
+|mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.36|-|[mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)|
+|mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.36|-|[mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)|
+|mixtral-moe-7b-aqlm-2bit-1x16|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf)|
+|mixtral-moe-8x22b-v1|[AI-ModelScope/Mixtral-8x22B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x22B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.36|-|[mistral-community/Mixtral-8x22B-v0.1](https://huggingface.co/mistral-community/Mixtral-8x22B-v0.1)|
+|baichuan-7b|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)|W_pack|default-generation|&#x2718;|&#x2714;|transformers<4.34|-|[baichuan-inc/Baichuan-7B](https://huggingface.co/baichuan-inc/Baichuan-7B)|
+|baichuan-13b|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;|transformers<4.34|-|[baichuan-inc/Baichuan-13B-Base](https://huggingface.co/baichuan-inc/Baichuan-13B-Base)|
+|baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;|transformers<4.34|-|[baichuan-inc/Baichuan-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan-13B-Chat)|
+|baichuan2-7b|[baichuan-inc/Baichuan2-7B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;||-|[baichuan-inc/Baichuan2-7B-Base](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base)|
+|baichuan2-7b-chat|[baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;||-|[baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)|
+|baichuan2-7b-chat-int4|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary)|W_pack|baichuan|&#x2718;|&#x2718;|bitsandbytes<0.41.2, accelerate<0.26|-|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat-4bits)|
+|baichuan2-13b|[baichuan-inc/Baichuan2-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;||-|[baichuan-inc/Baichuan2-13B-Base](https://huggingface.co/baichuan-inc/Baichuan2-13B-Base)|
+|baichuan2-13b-chat|[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;||-|[baichuan-inc/Baichuan2-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat)|
+|baichuan2-13b-chat-int4|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)|W_pack|baichuan|&#x2718;|&#x2718;|bitsandbytes<0.41.2, accelerate<0.26|-|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat-4bits)|
+|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|&#x2714;|&#x2718;|transformers<4.35, icecream|-|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
+|mplug-owl2d1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|&#x2714;|&#x2718;|transformers<4.35, icecream|-|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
+|yuan2-2b-instruct|[YuanLLM/Yuan2.0-2B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-2B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|[IEITYuan/Yuan2-2B-hf](https://huggingface.co/IEITYuan/Yuan2-2B-hf)|
+|yuan2-2b-janus-instruct|[YuanLLM/Yuan2-2B-Janus-hf](https://modelscope.cn/models/YuanLLM/Yuan2-2B-Janus-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|[IEITYuan/Yuan2-2B-Janus-hf](https://huggingface.co/IEITYuan/Yuan2-2B-Janus-hf)|
+|yuan2-51b-instruct|[YuanLLM/Yuan2.0-51B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-51B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|[IEITYuan/Yuan2-51B-hf](https://huggingface.co/IEITYuan/Yuan2-51B-hf)|
+|yuan2-102b-instruct|[YuanLLM/Yuan2.0-102B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-102B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|[IEITYuan/Yuan2-102B-hf](https://huggingface.co/IEITYuan/Yuan2-102B-hf)|
+|xverse-7b|[xverse/XVERSE-7B](https://modelscope.cn/models/xverse/XVERSE-7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[xverse/XVERSE-7B](https://huggingface.co/xverse/XVERSE-7B)|
+|xverse-7b-chat|[xverse/XVERSE-7B-Chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary)|q_proj, k_proj, v_proj|xverse|&#x2718;|&#x2718;||-|[xverse/XVERSE-7B-Chat](https://huggingface.co/xverse/XVERSE-7B-Chat)|
+|xverse-13b|[xverse/XVERSE-13B](https://modelscope.cn/models/xverse/XVERSE-13B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[xverse/XVERSE-13B](https://huggingface.co/xverse/XVERSE-13B)|
+|xverse-13b-chat|[xverse/XVERSE-13B-Chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)|q_proj, k_proj, v_proj|xverse|&#x2718;|&#x2718;||-|[xverse/XVERSE-13B-Chat](https://huggingface.co/xverse/XVERSE-13B-Chat)|
+|xverse-65b|[xverse/XVERSE-65B](https://modelscope.cn/models/xverse/XVERSE-65B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[xverse/XVERSE-65B](https://huggingface.co/xverse/XVERSE-65B)|
+|xverse-65b-v2|[xverse/XVERSE-65B-2](https://modelscope.cn/models/xverse/XVERSE-65B-2/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[xverse/XVERSE-65B-2](https://huggingface.co/xverse/XVERSE-65B-2)|
+|xverse-65b-chat|[xverse/XVERSE-65B-Chat](https://modelscope.cn/models/xverse/XVERSE-65B-Chat/summary)|q_proj, k_proj, v_proj|xverse|&#x2718;|&#x2718;||-|[xverse/XVERSE-65B-Chat](https://huggingface.co/xverse/XVERSE-65B-Chat)|
+|xverse-13b-256k|[xverse/XVERSE-13B-256K](https://modelscope.cn/models/xverse/XVERSE-13B-256K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[xverse/XVERSE-13B-256K](https://huggingface.co/xverse/XVERSE-13B-256K)|
+|xverse-moe-a4_2b|[xverse/XVERSE-MoE-A4.2B](https://modelscope.cn/models/xverse/XVERSE-MoE-A4.2B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[xverse/XVERSE-MoE-A4.2B](https://huggingface.co/xverse/XVERSE-MoE-A4.2B)|
+|orion-14b|[OrionStarAI/Orion-14B-Base](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2718;||-|[OrionStarAI/Orion-14B-Base](https://huggingface.co/OrionStarAI/Orion-14B-Base)|
+|orion-14b-chat|[OrionStarAI/Orion-14B-Chat](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat/summary)|q_proj, k_proj, v_proj|orion|&#x2714;|&#x2718;||-|[OrionStarAI/Orion-14B-Chat](https://huggingface.co/OrionStarAI/Orion-14B-Chat)|
+|bluelm-7b|[vivo-ai/BlueLM-7B-Base](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2718;||-|[vivo-ai/BlueLM-7B-Base](https://huggingface.co/vivo-ai/BlueLM-7B-Base)|
+|bluelm-7b-32k|[vivo-ai/BlueLM-7B-Base-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2718;||-|[vivo-ai/BlueLM-7B-Base-32K](https://huggingface.co/vivo-ai/BlueLM-7B-Base-32K)|
+|bluelm-7b-chat|[vivo-ai/BlueLM-7B-Chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary)|q_proj, k_proj, v_proj|bluelm|&#x2718;|&#x2718;||-|[vivo-ai/BlueLM-7B-Chat](https://huggingface.co/vivo-ai/BlueLM-7B-Chat)|
+|bluelm-7b-chat-32k|[vivo-ai/BlueLM-7B-Chat-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)|q_proj, k_proj, v_proj|bluelm|&#x2718;|&#x2718;||-|[vivo-ai/BlueLM-7B-Chat-32K](https://huggingface.co/vivo-ai/BlueLM-7B-Chat-32K)|
+|ziya2-13b|[Fengshenbang/Ziya2-13B-Base](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[IDEA-CCNL/Ziya2-13B-Base](https://huggingface.co/IDEA-CCNL/Ziya2-13B-Base)|
+|ziya2-13b-chat|[Fengshenbang/Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)|q_proj, k_proj, v_proj|ziya|&#x2714;|&#x2714;||-|[IDEA-CCNL/Ziya2-13B-Chat](https://huggingface.co/IDEA-CCNL/Ziya2-13B-Chat)|
+|skywork-13b|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2718;||-|[Skywork/Skywork-13B-base](https://huggingface.co/Skywork/Skywork-13B-base)|
+|skywork-13b-chat|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)|q_proj, k_proj, v_proj|skywork|&#x2718;|&#x2718;||-|-|
+|zephyr-7b-beta-chat|[modelscope/zephyr-7b-beta](https://modelscope.cn/models/modelscope/zephyr-7b-beta/summary)|q_proj, k_proj, v_proj|zephyr|&#x2714;|&#x2714;|transformers>=4.34|-|[HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)|
+|polylm-13b|[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary)|c_attn|default-generation|&#x2718;|&#x2718;||-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)|
+|seqgpt-560m|[damo/nlp_seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)|query_key_value|default-generation|&#x2718;|&#x2714;||-|[DAMO-NLP/SeqGPT-560M](https://huggingface.co/DAMO-NLP/SeqGPT-560M)|
+|sus-34b-chat|[SUSTC/SUS-Chat-34B](https://modelscope.cn/models/SUSTC/SUS-Chat-34B/summary)|q_proj, k_proj, v_proj|sus|&#x2714;|&#x2714;||-|[SUSTech/SUS-Chat-34B](https://huggingface.co/SUSTech/SUS-Chat-34B)|
+|tongyi-finance-14b|[TongyiFinance/Tongyi-Finance-14B](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||financial|-|
+|tongyi-finance-14b-chat|[TongyiFinance/Tongyi-Finance-14B-Chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||financial|[jxy/Tongyi-Finance-14B-Chat](https://huggingface.co/jxy/Tongyi-Finance-14B-Chat)|
+|tongyi-finance-14b-chat-int4|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|financial|[jxy/Tongyi-Finance-14B-Chat-Int4](https://huggingface.co/jxy/Tongyi-Finance-14B-Chat-Int4)|
+|codefuse-codellama-34b-chat|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)|q_proj, k_proj, v_proj|codefuse-codellama|&#x2714;|&#x2714;||coding|[codefuse-ai/CodeFuse-CodeLlama-34B](https://huggingface.co/codefuse-ai/CodeFuse-CodeLlama-34B)|
+|codefuse-codegeex2-6b-chat|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B/summary)|query_key_value|codefuse|&#x2718;|&#x2714;|transformers<4.34|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)|
+|codefuse-qwen-14b-chat|[codefuse-ai/CodeFuse-QWen-14B](https://modelscope.cn/models/codefuse-ai/CodeFuse-QWen-14B/summary)|c_attn|codefuse|&#x2714;|&#x2714;||coding|[codefuse-ai/CodeFuse-QWen-14B](https://huggingface.co/codefuse-ai/CodeFuse-QWen-14B)|
+|phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|&#x2714;|&#x2714;||coding|[microsoft/phi-2](https://huggingface.co/microsoft/phi-2)|
+|cogvlm-17b-instruct|[ZhipuAI/cogvlm-chat](https://modelscope.cn/models/ZhipuAI/cogvlm-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense|cogvlm-instruct|&#x2718;|&#x2718;||multi-modal, vision|[THUDM/cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf)|
+|cogagent-18b-chat|[ZhipuAI/cogagent-chat](https://modelscope.cn/models/ZhipuAI/cogagent-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-chat|&#x2718;|&#x2718;||multi-modal, vision|[THUDM/cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf)|
+|cogagent-18b-instruct|[ZhipuAI/cogagent-vqa](https://modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-instruct|&#x2718;|&#x2718;||multi-modal, vision|[THUDM/cogagent-vqa-hf](https://huggingface.co/THUDM/cogagent-vqa-hf)|
+|mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)|
+|mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)|
+|mamba-390m|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-390m-hf](https://huggingface.co/state-spaces/mamba-390m-hf)|
+|mamba-790m|[AI-ModelScope/mamba-790m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-790m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-790m-hf](https://huggingface.co/state-spaces/mamba-790m-hf)|
+|mamba-1.4b|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-1.4b-hf](https://huggingface.co/state-spaces/mamba-1.4b-hf)|
+|mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)|
+|telechat-7b|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B/summary)|key_value, query|telechat|&#x2714;|&#x2718;||-|[Tele-AI/telechat-7B](https://huggingface.co/Tele-AI/telechat-7B)|
+|telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|key_value, query|telechat|&#x2714;|&#x2718;||-|[Tele-AI/TeleChat-12B](https://huggingface.co/Tele-AI/TeleChat-12B)|
+|grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1)|
+|dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|[databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct)|
+|dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|[databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)|
+|mengzi3-13b-base|[langboat/Mengzi3-13B-Base](https://modelscope.cn/models/langboat/Mengzi3-13B-Base/summary)|q_proj, k_proj, v_proj|mengzi|&#x2714;|&#x2714;||-|[Langboat/Mengzi3-13B-Base](https://huggingface.co/Langboat/Mengzi3-13B-Base)|
+|c4ai-command-r-v01|[AI-ModelScope/c4ai-command-r-v01](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-v01/summary)|q_proj, k_proj, v_proj|c4ai|&#x2714;|&#x2718;|transformers>=4.39.1|-|[CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)|
+|c4ai-command-r-plus|[AI-ModelScope/c4ai-command-r-plus](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-plus/summary)|q_proj, k_proj, v_proj|c4ai|&#x2714;|&#x2718;|transformers>4.39|-|[CohereForAI/c4ai-command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus)|
 
 
 ## 数据集
@@ -233,95 +235,95 @@
 - Size: 数据集中的数据样本数量.
 - Statistic: 数据集的统计量. 我们使用token数进行统计, 这对于调整`max_length`超参数有帮助. 我们将数据集的训练集和验证集进行拼接, 然后进行统计. 我们使用qwen的tokenizer对数据集进行分词. 不同的tokenizer的统计量不同, 如果你要获取其他的模型的tokenizer的token统计量, 可以通过[脚本](https://github.com/modelscope/swift/tree/main/scripts/utils/run_dataset_info.py)自行获取.
 
-| Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags |
-| ------------ | ---------- | ---------- | -------- | ----------------- | ---- |
-|🔥ms-bench|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|316228|0|345.0±441.3, min=22, max=30960|chat, general, multi-round|
-|🔥ms-bench-mini|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|19492|0|353.9±439.4, min=29, max=12078|chat, general, multi-round|
-|🔥alpaca-en|[AI-ModelScope/alpaca-gpt4-data-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)|52002|0|176.2±125.8, min=26, max=740|chat, general|
-|🔥alpaca-zh|[AI-ModelScope/alpaca-gpt4-data-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)|48818|0|162.1±93.9, min=26, max=856|chat, general|
-|multi-alpaca-all|[damo/nlp_polylm_multialpaca_sft](https://modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary)|131867|0|112.9±50.6, min=26, max=1226|chat, general, multilingual|
-|instinwild-en|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|52191|0|160.2±69.7, min=33, max=763|chat, general|
-|instinwild-zh|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|51504|0|130.3±45.1, min=28, max=1434|chat, general|
-|cot-en|[YorickHe/CoT](https://modelscope.cn/datasets/YorickHe/CoT/summary)|74771|0|122.7±64.8, min=51, max=8320|chat, general|
-|cot-zh|[YorickHe/CoT_zh](https://modelscope.cn/datasets/YorickHe/CoT_zh/summary)|74771|0|117.5±70.8, min=43, max=9636|chat, general|
-|firefly-all-zh|[wyj123456/firefly](https://modelscope.cn/datasets/wyj123456/firefly/summary)|1649399|0|178.1±260.4, min=26, max=12516|chat, general|
-|instruct-en|[wyj123456/instruct](https://modelscope.cn/datasets/wyj123456/instruct/summary)|888970|0|268.9±331.2, min=26, max=7252|chat, general|
-|gpt4all-en|[wyj123456/GPT4all](https://modelscope.cn/datasets/wyj123456/GPT4all/summary)|806199|0|302.5±384.1, min=27, max=7391|chat, general|
-|sharegpt-en|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|99799|0|1045.7±431.9, min=22, max=7907|chat, general, multi-round|
-|sharegpt-zh|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|135399|0|806.3±771.7, min=21, max=65318|chat, general, multi-round|
-|tulu-v2-sft-mixture|[AI-ModelScope/tulu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture/summary)|326154|0|867.8±996.4, min=22, max=12111|chat, multilingual, general, multi-round|
-|wikipedia-zh|[AI-ModelScope/wikipedia-cn-20230720-filtered](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered/summary)|254547|0|568.4±713.2, min=37, max=78678|text-generation, general, pretrained|
-|open-orca|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|3239027|0|360.4±402.9, min=27, max=8672|chat, multilingual, general|
-|open-orca-gpt4|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|994896|0|382.3±417.4, min=31, max=8740|chat, multilingual, general|
-|sharegpt-gpt4|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|103063|0|1286.2±2089.4, min=22, max=221080|chat, multilingual, general, multi-round|
-|🔥sharegpt-gpt4-mini|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|6205|0|3511.6±6068.5, min=33, max=116018|chat, multilingual, general, multi-round, gpt4|
-|🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)|30000|0|647.7±217.1, min=199, max=2722|chat, agent, multi-round|
-|ms-agent-for-agentfabric-default|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|30000|0|617.8±199.1, min=251, max=2657|chat, agent, multi-round|
-|ms-agent-for-agentfabric-addition|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|488|0|2084.9±1514.8, min=489, max=7354|chat, agent, multi-round|
-|damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|422115|161|965.7±440.9, min=321, max=31535|chat, agent, multi-round|
-|damo-agent-mini-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|39964|152|1230.9±350.1, min=558, max=4982|chat, agent, multi-round|
-|agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|1866|0|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|
-|code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)|20016|0|100.1±60.1, min=29, max=1776|chat, coding|
-|🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)|2359|0|723.8±233.5, min=259, max=2117|chat, coding|
-|🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)|27224|0|483.6±193.9, min=45, max=3082|chat, coding|
-|🔥codefuse-evol-instruction-zh|[codefuse-ai/Evol-instruction-66k](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k/summary)|66862|0|439.6±206.3, min=37, max=2983|chat, coding|
-|medical-en|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|117117|500|257.4±89.1, min=36, max=2564|chat, medical|
-|medical-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|1950472|500|167.2±219.7, min=26, max=27351|chat, medical|
-|medical-mini-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|50000|500|168.1±220.8, min=26, max=12320|chat, medical|
-|🔥disc-med-sft-zh|[AI-ModelScope/DISC-Med-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Med-SFT/summary)|441767|0|354.1±193.1, min=25, max=2231|chat, medical|
-|lawyer-llama-zh|[AI-ModelScope/lawyer_llama_data](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary)|21476|0|194.4±91.7, min=27, max=924|chat, law|
-|tigerbot-law-zh|[AI-ModelScope/tigerbot-law-plugin](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)|55895|0|109.9±126.4, min=37, max=18878|text-generation, law, pretrained|
-|🔥disc-law-sft-zh|[AI-ModelScope/DISC-Law-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Law-SFT/summary)|166758|0|533.7±495.4, min=30, max=15169|chat, law|
-|🔥blossom-math-zh|[AI-ModelScope/blossom-math-v2](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2/summary)|10000|0|169.3±58.7, min=35, max=563|chat, math|
-|school-math-zh|[AI-ModelScope/school_math_0.25M](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M/summary)|248480|0|157.6±72.1, min=33, max=3450|chat, math|
-|open-platypus-en|[AI-ModelScope/Open-Platypus](https://modelscope.cn/datasets/AI-ModelScope/Open-Platypus/summary)|24926|0|367.9±254.8, min=30, max=3951|chat, math|
-|text2sql-en|[AI-ModelScope/texttosqlv2_25000_v2](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2/summary)|25000|0|274.6±326.4, min=38, max=1975|chat, sql|
-|🔥sql-create-context-en|[AI-ModelScope/sql-create-context](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context/summary)|78577|0|80.2±17.8, min=36, max=456|chat, sql|
-|🔥advertise-gen-zh|[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)|97484|915|131.6±21.7, min=52, max=242|text-generation|
-|🔥dureader-robust-zh|[modelscope/DuReader_robust-QG](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary)|15937|1962|242.1±137.4, min=61, max=1417|text-generation|
-|cmnli-zh|[clue](https://modelscope.cn/datasets/clue/summary)|391783|12241|83.6±16.6, min=52, max=200|text-generation, classification|
-|🔥cmnli-mini-zh|[clue](https://modelscope.cn/datasets/clue/summary)|20000|200|82.9±16.3, min=52, max=188|text-generation, classification|
-|🔥jd-sentiment-zh|[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd/summary)|45012|4988|67.0±83.2, min=40, max=4040|text-generation, classification|
-|🔥hc3-zh|[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese/summary)|39781|0|177.8±81.5, min=58, max=3052|text-generation, classification|
-|🔥hc3-en|[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3/summary)|11021|0|299.3±138.7, min=66, max=2268|text-generation, classification|
-|finance-en|[wyj123456/finance_en](https://modelscope.cn/datasets/wyj123456/finance_en/summary)|68911|0|135.6±134.3, min=26, max=3525|chat, financial|
-|poetry-zh|[modelscope/chinese-poetry-collection](https://modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary)|388599|1710|55.2±9.4, min=23, max=83|text-generation, poetry|
-|webnovel-zh|[AI-ModelScope/webnovel_cn](https://modelscope.cn/datasets/AI-ModelScope/webnovel_cn/summary)|50000|0|1478.9±11526.1, min=100, max=490484|chat, novel|
-|generated-chat-zh|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)|396004|0|273.3±52.0, min=32, max=873|chat, character-dialogue|
-|cls-fudan-news-zh|[damo/zh_cls_fudan-news](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/summary)|4959|0|3234.4±2547.5, min=91, max=19548|chat, classification|
-|ner-jave-zh|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)|1266|0|118.3±45.5, min=44, max=223|chat, ner|
-|long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)|11998|0|9619.0±8295.8, min=36, max=78925|longlora, QA|
-|coco-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|414113|40504|298.8±2.8, min=294, max=351|chat, multi-modal, vision|
-|🔥coco-mini-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|298.8±2.8, min=294, max=339|chat, multi-modal, vision|
-|🔥coco-mini-en-2|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|36.8±2.8, min=32, max=77|chat, multi-modal, vision|
-|capcha-images|[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images/summary)|6000|2000|29.0±0.0, min=29, max=29|chat, multi-modal, vision|
-|aishell1-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|134424|7176|152.2±36.8, min=63, max=419|chat, multi-modal, audio|
-|🔥aishell1-mini-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|14326|200|152.0±35.5, min=74, max=359|chat, multi-modal, audio|
-|hh-rlhf-harmless-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|42462|2308|167.2±123.1, min=22, max=986|rlhf, dpo, pairwise|
-|hh-rlhf-helpful-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|43777|2348|201.9±135.2, min=25, max=1070|rlhf, dpo, pairwise|
-|hh-rlhf-helpful-online|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|10150|1137|401.5±278.7, min=32, max=1987|rlhf, dpo, pairwise|
-|hh-rlhf-helpful-rejection-sampled|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise|
-|hh-rlhf-red-team-attempts|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise|
-|🔥hh-rlhf-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|172085|9292|172.8±124.0, min=22, max=1638|rlhf, dpo, pairwise|
-|hh-rlhf-cn-harmless-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|143.9±109.4, min=24, max=3078|rlhf, dpo, pairwise|
-|hh-rlhf-cn-helpful-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|176.8±120.0, min=26, max=1420|rlhf, dpo, pairwise|
-|hh-rlhf-cn-harmless-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|167.5±123.2, min=22, max=986|rlhf, dpo, pairwise|
-|hh-rlhf-cn-helpful-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|202.2±135.3, min=25, max=1070|rlhf, dpo, pairwise|
-|stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)|4483004|0|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|
-|pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)|214670|0|1612.3±8856.2, min=11, max=1208955|text-generation, awq|
-|🔥coig-cqia-chinese-traditional|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1111|0|172.6±59.9, min=55, max=856|general|
-|🔥coig-cqia-coig-pc|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3000|0|353.5±859.6, min=34, max=19288|general|
-|🔥coig-cqia-exam|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|4856|0|275.0±240.0, min=45, max=4932|general|
-|🔥coig-cqia-finance|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|11288|0|1266.4±561.1, min=60, max=10582|general|
-|🔥coig-cqia-douban|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3086|0|402.9±544.7, min=88, max=10870|general|
-|🔥coig-cqia-human-value|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1007|0|151.2±77.3, min=39, max=656|general|
-|🔥coig-cqia-logi-qa|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|421|0|309.8±188.8, min=43, max=1306|general|
-|🔥coig-cqia-ruozhiba|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|240|0|189.8±62.2, min=33, max=505|general|
-|🔥coig-cqia-segmentfault|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|458|0|449.0±495.8, min=87, max=6342|general|
-|🔥coig-cqia-wiki|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|10603|0|619.2±515.8, min=73, max=10140|general|
-|🔥coig-cqia-wikihow|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1485|0|1700.0±790.9, min=260, max=6371|general|
-|🔥coig-cqia-xhs|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1508|0|438.0±179.6, min=129, max=2191|general|
-|🔥coig-cqia-zhihu|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|5631|0|540.7±306.7, min=161, max=3036|general|
-|🔥ruozhiba-post-annual|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|1361|0|36.6±15.3, min=24, max=559|pretrain|
-|🔥ruozhiba-title-good|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|2597|0|41.9±19.3, min=22, max=246|pretrain|
-|🔥ruozhiba-title-norm|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|81700|0|39.9±12.8, min=21, max=386|pretrain|
+| Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags | HF Dataset ID |
+| ------------ | ---------- | ---------- | -------- | ----------------- | ---- | ------------- |
+|🔥ms-bench|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|316228|0|345.0±441.3, min=22, max=30960|chat, general, multi-round|-|
+|🔥ms-bench-mini|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|19492|0|353.9±439.4, min=29, max=12078|chat, general, multi-round|-|
+|🔥alpaca-en|[AI-ModelScope/alpaca-gpt4-data-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)|52002|0|176.2±125.8, min=26, max=740|chat, general|[vicgalle/alpaca-gpt4](https://huggingface.co/datasets/vicgalle/alpaca-gpt4)|
+|🔥alpaca-zh|[AI-ModelScope/alpaca-gpt4-data-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)|48818|0|162.1±93.9, min=26, max=856|chat, general|[c-s-ale/alpaca-gpt4-data-zh](https://huggingface.co/datasets/c-s-ale/alpaca-gpt4-data-zh)|
+|multi-alpaca-all|[damo/nlp_polylm_multialpaca_sft](https://modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary)|131867|0|112.9±50.6, min=26, max=1226|chat, general, multilingual|-|
+|instinwild-en|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|52191|0|160.2±69.7, min=33, max=763|chat, general|-|
+|instinwild-zh|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|51504|0|130.3±45.1, min=28, max=1434|chat, general|-|
+|cot-en|[YorickHe/CoT](https://modelscope.cn/datasets/YorickHe/CoT/summary)|74771|0|122.7±64.8, min=51, max=8320|chat, general|-|
+|cot-zh|[YorickHe/CoT_zh](https://modelscope.cn/datasets/YorickHe/CoT_zh/summary)|74771|0|117.5±70.8, min=43, max=9636|chat, general|-|
+|firefly-all-zh|[wyj123456/firefly](https://modelscope.cn/datasets/wyj123456/firefly/summary)|1649399|0|178.1±260.4, min=26, max=12516|chat, general|-|
+|instruct-en|[wyj123456/instruct](https://modelscope.cn/datasets/wyj123456/instruct/summary)|888970|0|268.9±331.2, min=26, max=7252|chat, general|-|
+|gpt4all-en|[wyj123456/GPT4all](https://modelscope.cn/datasets/wyj123456/GPT4all/summary)|806199|0|302.5±384.1, min=27, max=7391|chat, general|-|
+|sharegpt-en|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|99799|0|1045.7±431.9, min=22, max=7907|chat, general, multi-round|-|
+|sharegpt-zh|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|135399|0|806.3±771.7, min=21, max=65318|chat, general, multi-round|-|
+|tulu-v2-sft-mixture|[AI-ModelScope/tulu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture/summary)|326154|0|867.8±996.4, min=22, max=12111|chat, multilingual, general, multi-round|[allenai/tulu-v2-sft-mixture](https://huggingface.co/datasets/allenai/tulu-v2-sft-mixture)|
+|wikipedia-zh|[AI-ModelScope/wikipedia-cn-20230720-filtered](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered/summary)|254547|0|568.4±713.2, min=37, max=78678|text-generation, general, pretrained|[pleisto/wikipedia-cn-20230720-filtered](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)|
+|open-orca|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|3239027|0|360.4±402.9, min=27, max=8672|chat, multilingual, general|-|
+|open-orca-gpt4|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|994896|0|382.3±417.4, min=31, max=8740|chat, multilingual, general|-|
+|sharegpt-gpt4|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|103063|0|1286.2±2089.4, min=22, max=221080|chat, multilingual, general, multi-round|-|
+|🔥sharegpt-gpt4-mini|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|6205|0|3511.6±6068.5, min=33, max=116018|chat, multilingual, general, multi-round, gpt4|-|
+|🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)|30000|0|647.7±217.1, min=199, max=2722|chat, agent, multi-round|-|
+|ms-agent-for-agentfabric-default|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|30000|0|617.8±199.1, min=251, max=2657|chat, agent, multi-round|-|
+|ms-agent-for-agentfabric-addition|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|488|0|2084.9±1514.8, min=489, max=7354|chat, agent, multi-round|-|
+|damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|422115|161|965.7±440.9, min=321, max=31535|chat, agent, multi-round|-|
+|damo-agent-mini-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|39964|152|1230.9±350.1, min=558, max=4982|chat, agent, multi-round|-|
+|agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|1866|0|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|-|
+|code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)|20016|0|100.1±60.1, min=29, max=1776|chat, coding|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)|
+|🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)|2359|0|723.8±233.5, min=259, max=2117|chat, coding|-|
+|🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)|27224|0|483.6±193.9, min=45, max=3082|chat, coding|-|
+|🔥codefuse-evol-instruction-zh|[codefuse-ai/Evol-instruction-66k](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k/summary)|66862|0|439.6±206.3, min=37, max=2983|chat, coding|-|
+|medical-en|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|117117|500|257.4±89.1, min=36, max=2564|chat, medical|-|
+|medical-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|1950472|500|167.2±219.7, min=26, max=27351|chat, medical|-|
+|medical-mini-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|50000|500|168.1±220.8, min=26, max=12320|chat, medical|-|
+|🔥disc-med-sft-zh|[AI-ModelScope/DISC-Med-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Med-SFT/summary)|441767|0|354.1±193.1, min=25, max=2231|chat, medical|[Flmc/DISC-Med-SFT](https://huggingface.co/datasets/Flmc/DISC-Med-SFT)|
+|lawyer-llama-zh|[AI-ModelScope/lawyer_llama_data](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary)|21476|0|194.4±91.7, min=27, max=924|chat, law|[Skepsun/lawyer_llama_data](https://huggingface.co/datasets/Skepsun/lawyer_llama_data)|
+|tigerbot-law-zh|[AI-ModelScope/tigerbot-law-plugin](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)|55895|0|109.9±126.4, min=37, max=18878|text-generation, law, pretrained|[TigerResearch/tigerbot-law-plugin](https://huggingface.co/datasets/TigerResearch/tigerbot-law-plugin)|
+|🔥disc-law-sft-zh|[AI-ModelScope/DISC-Law-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Law-SFT/summary)|166758|0|533.7±495.4, min=30, max=15169|chat, law|-|
+|🔥blossom-math-zh|[AI-ModelScope/blossom-math-v2](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2/summary)|10000|0|169.3±58.7, min=35, max=563|chat, math|[Azure99/blossom-math-v2](https://huggingface.co/datasets/Azure99/blossom-math-v2)|
+|school-math-zh|[AI-ModelScope/school_math_0.25M](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M/summary)|248480|0|157.6±72.1, min=33, max=3450|chat, math|[BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)|
+|open-platypus-en|[AI-ModelScope/Open-Platypus](https://modelscope.cn/datasets/AI-ModelScope/Open-Platypus/summary)|24926|0|367.9±254.8, min=30, max=3951|chat, math|[garage-bAInd/Open-Platypus](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)|
+|text2sql-en|[AI-ModelScope/texttosqlv2_25000_v2](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2/summary)|25000|0|274.6±326.4, min=38, max=1975|chat, sql|[Clinton/texttosqlv2_25000_v2](https://huggingface.co/datasets/Clinton/texttosqlv2_25000_v2)|
+|🔥sql-create-context-en|[AI-ModelScope/sql-create-context](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context/summary)|78577|0|80.2±17.8, min=36, max=456|chat, sql|[b-mc2/sql-create-context](https://huggingface.co/datasets/b-mc2/sql-create-context)|
+|🔥advertise-gen-zh|[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)|97484|915|131.6±21.7, min=52, max=242|text-generation|[shibing624/AdvertiseGen](https://huggingface.co/datasets/shibing624/AdvertiseGen)|
+|🔥dureader-robust-zh|[modelscope/DuReader_robust-QG](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary)|15937|1962|242.1±137.4, min=61, max=1417|text-generation|-|
+|cmnli-zh|[clue](https://modelscope.cn/datasets/clue/summary)|391783|12241|83.6±16.6, min=52, max=200|text-generation, classification|[clue](https://huggingface.co/datasets/clue)|
+|🔥cmnli-mini-zh|[clue](https://modelscope.cn/datasets/clue/summary)|20000|200|82.9±16.3, min=52, max=188|text-generation, classification|[clue](https://huggingface.co/datasets/clue)|
+|🔥jd-sentiment-zh|[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd/summary)|45012|4988|67.0±83.2, min=40, max=4040|text-generation, classification|-|
+|🔥hc3-zh|[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese/summary)|39781|0|177.8±81.5, min=58, max=3052|text-generation, classification|[Hello-SimpleAI/HC3-Chinese](https://huggingface.co/datasets/Hello-SimpleAI/HC3-Chinese)|
+|🔥hc3-en|[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3/summary)|11021|0|299.3±138.7, min=66, max=2268|text-generation, classification|[Hello-SimpleAI/HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3)|
+|finance-en|[wyj123456/finance_en](https://modelscope.cn/datasets/wyj123456/finance_en/summary)|68911|0|135.6±134.3, min=26, max=3525|chat, financial|[ssbuild/alpaca_finance_en](https://huggingface.co/datasets/ssbuild/alpaca_finance_en)|
+|poetry-zh|[modelscope/chinese-poetry-collection](https://modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary)|388599|1710|55.2±9.4, min=23, max=83|text-generation, poetry|-|
+|webnovel-zh|[AI-ModelScope/webnovel_cn](https://modelscope.cn/datasets/AI-ModelScope/webnovel_cn/summary)|50000|0|1478.9±11526.1, min=100, max=490484|chat, novel|[zxbsmk/webnovel_cn](https://huggingface.co/datasets/zxbsmk/webnovel_cn)|
+|generated-chat-zh|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)|396004|0|273.3±52.0, min=32, max=873|chat, character-dialogue|[BelleGroup/generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)|
+|cls-fudan-news-zh|[damo/zh_cls_fudan-news](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/summary)|4959|0|3234.4±2547.5, min=91, max=19548|chat, classification|-|
+|ner-jave-zh|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)|1266|0|118.3±45.5, min=44, max=223|chat, ner|-|
+|long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)|11998|0|9619.0±8295.8, min=36, max=78925|longlora, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)|
+|coco-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|414113|40504|298.8±2.8, min=294, max=351|chat, multi-modal, vision|-|
+|🔥coco-mini-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|298.8±2.8, min=294, max=339|chat, multi-modal, vision|-|
+|🔥coco-mini-en-2|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|36.8±2.8, min=32, max=77|chat, multi-modal, vision|-|
+|capcha-images|[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images/summary)|6000|2000|29.0±0.0, min=29, max=29|chat, multi-modal, vision|-|
+|aishell1-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|134424|7176|152.2±36.8, min=63, max=419|chat, multi-modal, audio|-|
+|🔥aishell1-mini-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|14326|200|152.0±35.5, min=74, max=359|chat, multi-modal, audio|-|
+|hh-rlhf-harmless-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|42462|2308|167.2±123.1, min=22, max=986|rlhf, dpo, pairwise|-|
+|hh-rlhf-helpful-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|43777|2348|201.9±135.2, min=25, max=1070|rlhf, dpo, pairwise|-|
+|hh-rlhf-helpful-online|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|10150|1137|401.5±278.7, min=32, max=1987|rlhf, dpo, pairwise|-|
+|hh-rlhf-helpful-rejection-sampled|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise|-|
+|hh-rlhf-red-team-attempts|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise|-|
+|🔥hh-rlhf-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|172085|9292|172.8±124.0, min=22, max=1638|rlhf, dpo, pairwise|-|
+|hh-rlhf-cn-harmless-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|143.9±109.4, min=24, max=3078|rlhf, dpo, pairwise|-|
+|hh-rlhf-cn-helpful-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|176.8±120.0, min=26, max=1420|rlhf, dpo, pairwise|-|
+|hh-rlhf-cn-harmless-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|167.5±123.2, min=22, max=986|rlhf, dpo, pairwise|-|
+|hh-rlhf-cn-helpful-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|202.2±135.3, min=25, max=1070|rlhf, dpo, pairwise|-|
+|stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)|4483004|0|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|-|
+|pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)|214670|0|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)|
+|🔥coig-cqia-chinese-traditional|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1111|0|172.6±59.9, min=55, max=856|general|-|
+|🔥coig-cqia-coig-pc|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3000|0|353.5±859.6, min=34, max=19288|general|-|
+|🔥coig-cqia-exam|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|4856|0|275.0±240.0, min=45, max=4932|general|-|
+|🔥coig-cqia-finance|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|11288|0|1266.4±561.1, min=60, max=10582|general|-|
+|🔥coig-cqia-douban|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3086|0|402.9±544.7, min=88, max=10870|general|-|
+|🔥coig-cqia-human-value|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1007|0|151.2±77.3, min=39, max=656|general|-|
+|🔥coig-cqia-logi-qa|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|421|0|309.8±188.8, min=43, max=1306|general|-|
+|🔥coig-cqia-ruozhiba|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|240|0|189.8±62.2, min=33, max=505|general|-|
+|🔥coig-cqia-segmentfault|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|458|0|449.0±495.8, min=87, max=6342|general|-|
+|🔥coig-cqia-wiki|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|10603|0|619.2±515.8, min=73, max=10140|general|-|
+|🔥coig-cqia-wikihow|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1485|0|1700.0±790.9, min=260, max=6371|general|-|
+|🔥coig-cqia-xhs|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1508|0|438.0±179.6, min=129, max=2191|general|-|
+|🔥coig-cqia-zhihu|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|5631|0|540.7±306.7, min=161, max=3036|general|-|
+|🔥ruozhiba-post-annual|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|1361|0|36.6±15.3, min=24, max=559|pretrain|-|
+|🔥ruozhiba-title-good|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|2597|0|41.9±19.3, min=22, max=246|pretrain|-|
+|🔥ruozhiba-title-norm|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|81700|0|39.9±12.8, min=21, max=386|pretrain|-|
diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md
index 2737c214ec..414c714d9d 100644
--- a/docs/source_en/LLM/Supported-models-datasets.md
+++ b/docs/source_en/LLM/Supported-models-datasets.md
@@ -12,218 +12,220 @@ The table below introcudes all models supported by SWIFT:
 - Support VLLM: Whether the model supports [vllm](https://github.com/vllm-project/vllm) to accelerate infer and deployment.
 - Requires: The extra requirements used by the model.
 
-| Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags |
-| ---------  | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- |
-|qwen-1_8b|[qwen/Qwen-1_8B](https://modelscope.cn/models/qwen/Qwen-1_8B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|
-|qwen-1_8b-chat|[qwen/Qwen-1_8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|
-|qwen-1_8b-chat-int4|[qwen/Qwen-1_8B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|
-|qwen-1_8b-chat-int8|[qwen/Qwen-1_8B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|
-|qwen-7b|[qwen/Qwen-7B](https://modelscope.cn/models/qwen/Qwen-7B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|
-|qwen-7b-chat|[qwen/Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|
-|qwen-7b-chat-int4|[qwen/Qwen-7B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|
-|qwen-7b-chat-int8|[qwen/Qwen-7B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|
-|qwen-14b|[qwen/Qwen-14B](https://modelscope.cn/models/qwen/Qwen-14B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|
-|qwen-14b-chat|[qwen/Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|
-|qwen-14b-chat-int4|[qwen/Qwen-14B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|
-|qwen-14b-chat-int8|[qwen/Qwen-14B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|
-|qwen-72b|[qwen/Qwen-72B](https://modelscope.cn/models/qwen/Qwen-72B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|
-|qwen-72b-chat|[qwen/Qwen-72B-Chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|
-|qwen-72b-chat-int4|[qwen/Qwen-72B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|
-|qwen-72b-chat-int8|[qwen/Qwen-72B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|
-|qwen1half-0_5b|[qwen/Qwen1.5-0.5B](https://modelscope.cn/models/qwen/Qwen1.5-0.5B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-1_8b|[qwen/Qwen1.5-1.8B](https://modelscope.cn/models/qwen/Qwen1.5-1.8B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-4b|[qwen/Qwen1.5-4B](https://modelscope.cn/models/qwen/Qwen1.5-4B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-7b|[qwen/Qwen1.5-7B](https://modelscope.cn/models/qwen/Qwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-14b|[qwen/Qwen1.5-14B](https://modelscope.cn/models/qwen/Qwen1.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-32b|[qwen/Qwen1.5-32B](https://modelscope.cn/models/qwen/Qwen1.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-72b|[qwen/Qwen1.5-72B](https://modelscope.cn/models/qwen/Qwen1.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|codeqwen1half-7b|[qwen/CodeQwen1.5-7B](https://modelscope.cn/models/qwen/CodeQwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-moe-a2_7b|[qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-0_5b-chat|[qwen/Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-1_8b-chat|[qwen/Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-4b-chat|[qwen/Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-7b-chat|[qwen/Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-14b-chat|[qwen/Qwen1.5-14B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-32b-chat|[qwen/Qwen1.5-32B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-72b-chat|[qwen/Qwen1.5-72B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-moe-a2_7b-chat|[qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|codeqwen1half-7b-chat|[qwen/CodeQwen1.5-7B-Chat](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|
-|qwen1half-0_5b-chat-int4|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-1_8b-chat-int4|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-4b-chat-int4|[qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-7b-chat-int4|[qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-14b-chat-int4|[qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-32b-chat-int4|[qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-72b-chat-int4|[qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-0_5b-chat-int8|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-1_8b-chat-int8|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-4b-chat-int8|[qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-7b-chat-int8|[qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-14b-chat-int8|[qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-72b-chat-int8|[qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-moe-a2_7b-chat-int4|[qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|
-|qwen1half-0_5b-chat-awq|[qwen/Qwen1.5-0.5B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|qwen1half-1_8b-chat-awq|[qwen/Qwen1.5-1.8B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|qwen1half-4b-chat-awq|[qwen/Qwen1.5-4B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|qwen1half-7b-chat-awq|[qwen/Qwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|qwen1half-14b-chat-awq|[qwen/Qwen1.5-14B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|qwen1half-72b-chat-awq|[qwen/Qwen1.5-72B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|codeqwen1half-7b-chat-awq|[qwen/CodeQwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|
-|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|&#x2714;|&#x2718;||multi-modal, vision|
-|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|&#x2714;|&#x2718;||multi-modal, vision|
-|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|multi-modal, vision|
-|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|&#x2714;|&#x2718;||multi-modal, audio|
-|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|&#x2714;|&#x2718;||multi-modal, audio|
-|chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|&#x2718;|&#x2714;||-|
-|chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|&#x2718;|&#x2714;||-|
-|chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|&#x2718;|&#x2714;||-|
-|chatglm3-6b|[ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)|query_key_value|chatglm3|&#x2718;|&#x2714;||-|
-|chatglm3-6b-32k|[ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)|query_key_value|chatglm3|&#x2718;|&#x2714;||-|
-|codegeex2-6b|[ZhipuAI/codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary)|query_key_value|chatglm-generation|&#x2718;|&#x2714;|transformers<4.34|coding|
-|llama2-7b|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|llama2-7b-chat|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;||-|
-|llama2-13b|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|llama2-13b-chat|[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;||-|
-|llama2-70b|[modelscope/Llama-2-70b-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|llama2-70b-chat|[modelscope/Llama-2-70b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;||-|
-|llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;|transformers>=4.38, aqlm, torch>=2.2.0|-|
-|llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|
-|llava1d6-yi-34b-instruct|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b/summary)|q_proj, k_proj, v_proj|llava-yi-instruct|&#x2714;|&#x2718;||multi-modal, vision|
-|yi-6b|[01ai/Yi-6B](https://modelscope.cn/models/01ai/Yi-6B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|
-|yi-6b-200k|[01ai/Yi-6B-200K](https://modelscope.cn/models/01ai/Yi-6B-200K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|
-|yi-6b-chat|[01ai/Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary)|q_proj, k_proj, v_proj|yi|&#x2714;|&#x2714;||-|
-|yi-9b|[01ai/Yi-9B](https://modelscope.cn/models/01ai/Yi-9B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|
-|yi-34b|[01ai/Yi-34B](https://modelscope.cn/models/01ai/Yi-34B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|
-|yi-34b-200k|[01ai/Yi-34B-200K](https://modelscope.cn/models/01ai/Yi-34B-200K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|
-|yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|&#x2714;|&#x2714;||-|
-|yi-vl-6b-chat|[01ai/Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary)|q_proj, k_proj, v_proj|yi-vl|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|
-|yi-vl-34b-chat|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B/summary)|q_proj, k_proj, v_proj|yi-vl|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|
-|internlm-7b|[Shanghai_AI_Laboratory/internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2714;||-|
-|internlm-7b-chat|[Shanghai_AI_Laboratory/internlm-chat-7b-v1_1](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary)|q_proj, k_proj, v_proj|internlm|&#x2718;|&#x2714;||-|
-|internlm-7b-chat-8k|[Shanghai_AI_Laboratory/internlm-chat-7b-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary)|q_proj, k_proj, v_proj|internlm|&#x2718;|&#x2714;||-|
-|internlm-20b|[Shanghai_AI_Laboratory/internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2714;||-|
-|internlm-20b-chat|[Shanghai_AI_Laboratory/internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)|q_proj, k_proj, v_proj|internlm|&#x2718;|&#x2714;||-|
-|internlm2-1_8b|[Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|
-|internlm2-1_8b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|
-|internlm2-1_8b-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|
-|internlm2-7b-base|[Shanghai_AI_Laboratory/internlm2-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-7b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|
-|internlm2-7b|[Shanghai_AI_Laboratory/internlm2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-7b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|
-|internlm2-7b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b-sft/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|
-|internlm2-7b-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|
-|internlm2-20b-base|[Shanghai_AI_Laboratory/internlm2-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-20b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|
-|internlm2-20b|[Shanghai_AI_Laboratory/internlm2-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-20b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|
-|internlm2-20b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b-sft/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|
-|internlm2-20b-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|
-|internlm2-math-7b|[Shanghai_AI_Laboratory/internlm2-math-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-7b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||math|
-|internlm2-math-7b-chat|[Shanghai_AI_Laboratory/internlm2-math-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||math|
-|internlm2-math-20b|[Shanghai_AI_Laboratory/internlm2-math-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-20b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||math|
-|internlm2-math-20b-chat|[Shanghai_AI_Laboratory/internlm2-math-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-20b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||math|
-|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|wqkv|internlm-xcomposer2|&#x2714;|&#x2718;||multi-modal, vision|
-|deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||-|
-|deepseek-moe-16b|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|deepseek-moe-16b-chat|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||-|
-|deepseek-67b|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|deepseek-67b-chat|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||-|
-|deepseek-coder-1_3b|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||coding|
-|deepseek-coder-1_3b-instruct|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||coding|
-|deepseek-coder-6_7b|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||coding|
-|deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||coding|
-|deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||coding|
-|deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||coding|
-|deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||math|
-|deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||math|
-|deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||math|
-|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;||multi-modal, vision|
-|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;||multi-modal, vision|
-|gemma-2b|[AI-ModelScope/gemma-2b](https://modelscope.cn/models/AI-ModelScope/gemma-2b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.38|-|
-|gemma-7b|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.38|-|
-|gemma-2b-instruct|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it/summary)|q_proj, k_proj, v_proj|gemma|&#x2714;|&#x2714;|transformers>=4.38|-|
-|gemma-7b-instruct|[AI-ModelScope/gemma-7b-it](https://modelscope.cn/models/AI-ModelScope/gemma-7b-it/summary)|q_proj, k_proj, v_proj|gemma|&#x2714;|&#x2714;|transformers>=4.38|-|
-|minicpm-1b-sft-chat|[OpenBMB/MiniCPM-1B-sft-bf16](https://modelscope.cn/models/OpenBMB/MiniCPM-1B-sft-bf16/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;|transformers>=4.36.0|-|
-|minicpm-2b-sft-chat|[OpenBMB/MiniCPM-2B-sft-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-sft-fp32/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;||-|
-|minicpm-2b-chat|[OpenBMB/MiniCPM-2B-dpo-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp32/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;||-|
-|minicpm-2b-128k|[OpenBMB/MiniCPM-2B-128k](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-128k/summary)|q_proj, k_proj, v_proj|chatml|&#x2714;|&#x2714;|transformers>=4.36.0|-|
-|minicpm-moe-8x2b|[OpenBMB/MiniCPM-MoE-8x2B](https://modelscope.cn/models/OpenBMB/MiniCPM-MoE-8x2B/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;|transformers>=4.36.0|-|
-|minicpm-v-3b-chat|[OpenBMB/MiniCPM-V](https://modelscope.cn/models/OpenBMB/MiniCPM-V/summary)|q_proj, k_proj, v_proj|minicpm-v|&#x2714;|&#x2718;||-|
-|minicpm-v-v2|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2/summary)|q_proj, k_proj, v_proj|minicpm-v|&#x2714;|&#x2718;||-|
-|openbuddy-llama2-13b-chat|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|
-|openbuddy-llama-65b-chat|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|
-|openbuddy-llama2-70b-chat|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|
-|openbuddy-mistral-7b-chat|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v17.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.34|-|
-|openbuddy-zephyr-7b-chat|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.34|-|
-|openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|
-|openbuddy-mixtral-moe-7b-chat|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.36|-|
-|mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.34|-|
-|mistral-7b-v2|[AI-ModelScope/Mistral-7B-v0.2-hf](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.2-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.34|-|
-|mistral-7b-instruct|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.34|-|
-|mistral-7b-instruct-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.34|-|
-|mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.36|-|
-|mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.36|-|
-|mixtral-moe-7b-aqlm-2bit-1x16|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;|transformers>=4.38, aqlm, torch>=2.2.0|-|
-|mixtral-moe-8x22b-v1|[AI-ModelScope/Mixtral-8x22B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x22B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.36|-|
-|baichuan-7b|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)|W_pack|default-generation|&#x2718;|&#x2714;|transformers<4.34|-|
-|baichuan-13b|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;|transformers<4.34|-|
-|baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;|transformers<4.34|-|
-|baichuan2-7b|[baichuan-inc/Baichuan2-7B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;||-|
-|baichuan2-7b-chat|[baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;||-|
-|baichuan2-7b-chat-int4|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary)|W_pack|baichuan|&#x2718;|&#x2718;|bitsandbytes<0.41.2, accelerate<0.26|-|
-|baichuan2-13b|[baichuan-inc/Baichuan2-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;||-|
-|baichuan2-13b-chat|[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;||-|
-|baichuan2-13b-chat-int4|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)|W_pack|baichuan|&#x2718;|&#x2718;|bitsandbytes<0.41.2, accelerate<0.26|-|
-|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|&#x2714;|&#x2718;|transformers<4.35, icecream|-|
-|mplug-owl2d1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|&#x2714;|&#x2718;|transformers<4.35, icecream|-|
-|yuan2-2b-instruct|[YuanLLM/Yuan2.0-2B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-2B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|
-|yuan2-2b-janus-instruct|[YuanLLM/Yuan2-2B-Janus-hf](https://modelscope.cn/models/YuanLLM/Yuan2-2B-Janus-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|
-|yuan2-51b-instruct|[YuanLLM/Yuan2.0-51B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-51B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|
-|yuan2-102b-instruct|[YuanLLM/Yuan2.0-102B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-102B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|
-|xverse-7b|[xverse/XVERSE-7B](https://modelscope.cn/models/xverse/XVERSE-7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|xverse-7b-chat|[xverse/XVERSE-7B-Chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary)|q_proj, k_proj, v_proj|xverse|&#x2718;|&#x2718;||-|
-|xverse-13b|[xverse/XVERSE-13B](https://modelscope.cn/models/xverse/XVERSE-13B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|xverse-13b-chat|[xverse/XVERSE-13B-Chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)|q_proj, k_proj, v_proj|xverse|&#x2718;|&#x2718;||-|
-|xverse-65b|[xverse/XVERSE-65B](https://modelscope.cn/models/xverse/XVERSE-65B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|xverse-65b-v2|[xverse/XVERSE-65B-2](https://modelscope.cn/models/xverse/XVERSE-65B-2/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|xverse-65b-chat|[xverse/XVERSE-65B-Chat](https://modelscope.cn/models/xverse/XVERSE-65B-Chat/summary)|q_proj, k_proj, v_proj|xverse|&#x2718;|&#x2718;||-|
-|xverse-13b-256k|[xverse/XVERSE-13B-256K](https://modelscope.cn/models/xverse/XVERSE-13B-256K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|xverse-moe-a4_2b|[xverse/XVERSE-MoE-A4.2B](https://modelscope.cn/models/xverse/XVERSE-MoE-A4.2B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|orion-14b|[OrionStarAI/Orion-14B-Base](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2718;||-|
-|orion-14b-chat|[OrionStarAI/Orion-14B-Chat](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat/summary)|q_proj, k_proj, v_proj|orion|&#x2714;|&#x2718;||-|
-|bluelm-7b|[vivo-ai/BlueLM-7B-Base](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2718;||-|
-|bluelm-7b-32k|[vivo-ai/BlueLM-7B-Base-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2718;||-|
-|bluelm-7b-chat|[vivo-ai/BlueLM-7B-Chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary)|q_proj, k_proj, v_proj|bluelm|&#x2718;|&#x2718;||-|
-|bluelm-7b-chat-32k|[vivo-ai/BlueLM-7B-Chat-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)|q_proj, k_proj, v_proj|bluelm|&#x2718;|&#x2718;||-|
-|ziya2-13b|[Fengshenbang/Ziya2-13B-Base](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|
-|ziya2-13b-chat|[Fengshenbang/Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)|q_proj, k_proj, v_proj|ziya|&#x2714;|&#x2714;||-|
-|skywork-13b|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2718;||-|
-|skywork-13b-chat|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)|q_proj, k_proj, v_proj|skywork|&#x2718;|&#x2718;||-|
-|zephyr-7b-beta-chat|[modelscope/zephyr-7b-beta](https://modelscope.cn/models/modelscope/zephyr-7b-beta/summary)|q_proj, k_proj, v_proj|zephyr|&#x2714;|&#x2714;|transformers>=4.34|-|
-|polylm-13b|[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary)|c_attn|default-generation|&#x2718;|&#x2718;||-|
-|seqgpt-560m|[damo/nlp_seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)|query_key_value|default-generation|&#x2718;|&#x2714;||-|
-|sus-34b-chat|[SUSTC/SUS-Chat-34B](https://modelscope.cn/models/SUSTC/SUS-Chat-34B/summary)|q_proj, k_proj, v_proj|sus|&#x2714;|&#x2714;||-|
-|tongyi-finance-14b|[TongyiFinance/Tongyi-Finance-14B](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||financial|
-|tongyi-finance-14b-chat|[TongyiFinance/Tongyi-Finance-14B-Chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||financial|
-|tongyi-finance-14b-chat-int4|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|financial|
-|codefuse-codellama-34b-chat|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)|q_proj, k_proj, v_proj|codefuse-codellama|&#x2714;|&#x2714;||coding|
-|codefuse-codegeex2-6b-chat|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B/summary)|query_key_value|codefuse|&#x2718;|&#x2714;|transformers<4.34|coding|
-|codefuse-qwen-14b-chat|[codefuse-ai/CodeFuse-QWen-14B](https://modelscope.cn/models/codefuse-ai/CodeFuse-QWen-14B/summary)|c_attn|codefuse|&#x2714;|&#x2714;||coding|
-|phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|&#x2714;|&#x2714;||coding|
-|cogvlm-17b-instruct|[ZhipuAI/cogvlm-chat](https://modelscope.cn/models/ZhipuAI/cogvlm-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense|cogvlm-instruct|&#x2718;|&#x2718;||multi-modal, vision|
-|cogagent-18b-chat|[ZhipuAI/cogagent-chat](https://modelscope.cn/models/ZhipuAI/cogagent-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-chat|&#x2718;|&#x2718;||multi-modal, vision|
-|cogagent-18b-instruct|[ZhipuAI/cogagent-vqa](https://modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-instruct|&#x2718;|&#x2718;||multi-modal, vision|
-|mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|
-|mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|
-|mamba-390m|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|
-|mamba-790m|[AI-ModelScope/mamba-790m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-790m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|
-|mamba-1.4b|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|
-|mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|
-|telechat-7b|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B/summary)|key_value, query|telechat|&#x2714;|&#x2718;||-|
-|telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|key_value, query|telechat|&#x2714;|&#x2718;||-|
-|grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|
-|dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|
-|dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|
-|mengzi3-13b-base|[langboat/Mengzi3-13B-Base](https://modelscope.cn/models/langboat/Mengzi3-13B-Base/summary)|q_proj, k_proj, v_proj|mengzi|&#x2714;|&#x2714;||-|
-|c4ai-command-r-v01|[AI-ModelScope/c4ai-command-r-v01](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-v01/summary)|q_proj, k_proj, v_proj|c4ai|&#x2714;|&#x2718;|transformers>=4.39.1|-|
-|c4ai-command-r-plus|[AI-ModelScope/c4ai-command-r-plus](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-plus/summary)|q_proj, k_proj, v_proj|c4ai|&#x2714;|&#x2718;|transformers>4.39|-|
+| Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID |
+| ---------  | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | ----------- |
+|qwen-1_8b|[qwen/Qwen-1_8B](https://modelscope.cn/models/qwen/Qwen-1_8B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|[Qwen/Qwen1.5-1.8B](https://huggingface.co/Qwen/Qwen1.5-1.8B)|
+|qwen-1_8b-chat|[qwen/Qwen-1_8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|[Qwen/Qwen-1_8B-Chat](https://huggingface.co/Qwen/Qwen-1_8B-Chat)|
+|qwen-1_8b-chat-int4|[qwen/Qwen-1_8B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|[Qwen/Qwen-1_8B-Chat-Int4](https://huggingface.co/Qwen/Qwen-1_8B-Chat-Int4)|
+|qwen-1_8b-chat-int8|[qwen/Qwen-1_8B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|[Qwen/Qwen-1_8B-Chat-Int8](https://huggingface.co/Qwen/Qwen-1_8B-Chat-Int8)|
+|qwen-7b|[qwen/Qwen-7B](https://modelscope.cn/models/qwen/Qwen-7B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|[Qwen/Qwen-7B](https://huggingface.co/Qwen/Qwen-7B)|
+|qwen-7b-chat|[qwen/Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|[Qwen/Qwen-7B-Chat](https://huggingface.co/Qwen/Qwen-7B-Chat)|
+|qwen-7b-chat-int4|[qwen/Qwen-7B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|[Qwen/Qwen-7B-Chat-Int4](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4)|
+|qwen-7b-chat-int8|[qwen/Qwen-7B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|[Qwen/Qwen-7B-Chat-Int8](https://huggingface.co/Qwen/Qwen-7B-Chat-Int8)|
+|qwen-14b|[qwen/Qwen-14B](https://modelscope.cn/models/qwen/Qwen-14B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|[Qwen/Qwen-14B](https://huggingface.co/Qwen/Qwen-14B)|
+|qwen-14b-chat|[qwen/Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|[Qwen/Qwen-14B-Chat](https://huggingface.co/Qwen/Qwen-14B-Chat)|
+|qwen-14b-chat-int4|[qwen/Qwen-14B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|[Qwen/Qwen-14B-Chat-Int4](https://huggingface.co/Qwen/Qwen-14B-Chat-Int4)|
+|qwen-14b-chat-int8|[qwen/Qwen-14B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|[Qwen/Qwen-14B-Chat-Int8](https://huggingface.co/Qwen/Qwen-14B-Chat-Int8)|
+|qwen-72b|[qwen/Qwen-72B](https://modelscope.cn/models/qwen/Qwen-72B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||-|[Qwen/Qwen-72B](https://huggingface.co/Qwen/Qwen-72B)|
+|qwen-72b-chat|[qwen/Qwen-72B-Chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||-|[Qwen/Qwen-72B-Chat](https://huggingface.co/Qwen/Qwen-72B-Chat)|
+|qwen-72b-chat-int4|[qwen/Qwen-72B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|-|[Qwen/Qwen-72B-Chat-Int4](https://huggingface.co/Qwen/Qwen-72B-Chat-Int4)|
+|qwen-72b-chat-int8|[qwen/Qwen-72B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|-|[Qwen/Qwen-72B-Chat-Int8](https://huggingface.co/Qwen/Qwen-72B-Chat-Int8)|
+|qwen1half-0_5b|[qwen/Qwen1.5-0.5B](https://modelscope.cn/models/qwen/Qwen1.5-0.5B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-0.5B](https://huggingface.co/Qwen/Qwen1.5-0.5B)|
+|qwen1half-1_8b|[qwen/Qwen1.5-1.8B](https://modelscope.cn/models/qwen/Qwen1.5-1.8B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-1.8B](https://huggingface.co/Qwen/Qwen1.5-1.8B)|
+|qwen1half-4b|[qwen/Qwen1.5-4B](https://modelscope.cn/models/qwen/Qwen1.5-4B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B)|
+|qwen1half-7b|[qwen/Qwen1.5-7B](https://modelscope.cn/models/qwen/Qwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-7B](https://huggingface.co/Qwen/Qwen1.5-7B)|
+|qwen1half-14b|[qwen/Qwen1.5-14B](https://modelscope.cn/models/qwen/Qwen1.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-14B](https://huggingface.co/Qwen/Qwen1.5-14B)|
+|qwen1half-32b|[qwen/Qwen1.5-32B](https://modelscope.cn/models/qwen/Qwen1.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-32B](https://huggingface.co/Qwen/Qwen1.5-32B)|
+|qwen1half-72b|[qwen/Qwen1.5-72B](https://modelscope.cn/models/qwen/Qwen1.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-72B](https://huggingface.co/Qwen/Qwen1.5-72B)|
+|codeqwen1half-7b|[qwen/CodeQwen1.5-7B](https://modelscope.cn/models/qwen/CodeQwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|-|
+|qwen1half-moe-a2_7b|[qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B)|
+|qwen1half-0_5b-chat|[qwen/Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-0.5B-Chat](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat)|
+|qwen1half-1_8b-chat|[qwen/Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-1.8B-Chat](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat)|
+|qwen1half-4b-chat|[qwen/Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-4B-Chat](https://huggingface.co/Qwen/Qwen1.5-4B-Chat)|
+|qwen1half-7b-chat|[qwen/Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-7B-Chat](https://huggingface.co/Qwen/Qwen1.5-7B-Chat)|
+|qwen1half-14b-chat|[qwen/Qwen1.5-14B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-14B-Chat](https://huggingface.co/Qwen/Qwen1.5-14B-Chat)|
+|qwen1half-32b-chat|[qwen/Qwen1.5-32B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-32B-Chat](https://huggingface.co/Qwen/Qwen1.5-32B-Chat)|
+|qwen1half-72b-chat|[qwen/Qwen1.5-72B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-72B-Chat](https://huggingface.co/Qwen/Qwen1.5-72B-Chat)|
+|qwen1half-moe-a2_7b-chat|[qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat)|
+|codeqwen1half-7b-chat|[qwen/CodeQwen1.5-7B-Chat](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|-|-|
+|qwen1half-0_5b-chat-int4|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4)|
+|qwen1half-1_8b-chat-int4|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4)|
+|qwen1half-4b-chat-int4|[qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GPTQ-Int4)|
+|qwen1half-7b-chat-int4|[qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4)|
+|qwen1half-14b-chat-int4|[qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-GPTQ-Int4)|
+|qwen1half-32b-chat-int4|[qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-32B-Chat-GPTQ-Int4)|
+|qwen1half-72b-chat-int4|[qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-GPTQ-Int4)|
+|qwen1half-0_5b-chat-int8|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8)|
+|qwen1half-1_8b-chat-int8|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8)|
+|qwen1half-4b-chat-int8|[qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GPTQ-Int8)|
+|qwen1half-7b-chat-int8|[qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int8)|
+|qwen1half-14b-chat-int8|[qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-GPTQ-Int8)|
+|qwen1half-72b-chat-int8|[qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-GPTQ-Int8)|
+|qwen1half-moe-a2_7b-chat-int4|[qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4)|
+|qwen1half-0_5b-chat-awq|[qwen/Qwen1.5-0.5B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-0.5B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-AWQ)|
+|qwen1half-1_8b-chat-awq|[qwen/Qwen1.5-1.8B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-1.8B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-AWQ)|
+|qwen1half-4b-chat-awq|[qwen/Qwen1.5-4B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-4B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-AWQ)|
+|qwen1half-7b-chat-awq|[qwen/Qwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-7B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-AWQ)|
+|qwen1half-14b-chat-awq|[qwen/Qwen1.5-14B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-14B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-AWQ)|
+|qwen1half-32b-chat-awq|[qwen/Qwen1.5-32B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-32B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-32B-Chat-AWQ)|
+|qwen1half-72b-chat-awq|[qwen/Qwen1.5-72B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-72B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-AWQ)|
+|codeqwen1half-7b-chat-awq|[qwen/CodeQwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37, autoawq|-|-|
+|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|&#x2714;|&#x2718;||multi-modal, vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)|
+|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|&#x2714;|&#x2718;||multi-modal, vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)|
+|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|multi-modal, vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)|
+|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|&#x2714;|&#x2718;||multi-modal, audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)|
+|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|&#x2714;|&#x2718;||multi-modal, audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)|
+|chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|&#x2718;|&#x2714;||-|[THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)|
+|chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|&#x2718;|&#x2714;||-|[THUDM/chatglm2-6b-32k](https://huggingface.co/THUDM/chatglm2-6b-32k)|
+|chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|&#x2718;|&#x2714;||-|[THUDM/chatglm3-6b-base](https://huggingface.co/THUDM/chatglm3-6b-base)|
+|chatglm3-6b|[ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)|query_key_value|chatglm3|&#x2718;|&#x2714;||-|[THUDM/chatglm3-6b](https://huggingface.co/THUDM/chatglm3-6b)|
+|chatglm3-6b-32k|[ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)|query_key_value|chatglm3|&#x2718;|&#x2714;||-|[THUDM/chatglm3-6b-32k](https://huggingface.co/THUDM/chatglm3-6b-32k)|
+|codegeex2-6b|[ZhipuAI/codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary)|query_key_value|chatglm-generation|&#x2718;|&#x2714;|transformers<4.34|coding|[THUDM/codegeex2-6b](https://huggingface.co/THUDM/codegeex2-6b)|
+|llama2-7b|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)|
+|llama2-7b-chat|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;||-|[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)|
+|llama2-13b|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf)|
+|llama2-13b-chat|[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;||-|[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)|
+|llama2-70b|[modelscope/Llama-2-70b-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf)|
+|llama2-70b-chat|[modelscope/Llama-2-70b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;||-|[meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)|
+|llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)|
+|llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)|
+|llava1d6-yi-34b-instruct|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b/summary)|q_proj, k_proj, v_proj|llava-yi-instruct|&#x2714;|&#x2718;||multi-modal, vision|[liuhaotian/llava-v1.6-34b](https://huggingface.co/liuhaotian/llava-v1.6-34b)|
+|yi-6b|[01ai/Yi-6B](https://modelscope.cn/models/01ai/Yi-6B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B)|
+|yi-6b-200k|[01ai/Yi-6B-200K](https://modelscope.cn/models/01ai/Yi-6B-200K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[01-ai/Yi-6B-200K](https://huggingface.co/01-ai/Yi-6B-200K)|
+|yi-6b-chat|[01ai/Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary)|q_proj, k_proj, v_proj|yi|&#x2714;|&#x2714;||-|[01-ai/Yi-6B-Chat](https://huggingface.co/01-ai/Yi-6B-Chat)|
+|yi-9b|[01ai/Yi-9B](https://modelscope.cn/models/01ai/Yi-9B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[01-ai/Yi-9B](https://huggingface.co/01-ai/Yi-9B)|
+|yi-9b-200k|[01ai/Yi-9B-200K](https://modelscope.cn/models/01ai/Yi-9B-200K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[01-ai/Yi-9B-200K](https://huggingface.co/01-ai/Yi-9B-200K)|
+|yi-34b|[01ai/Yi-34B](https://modelscope.cn/models/01ai/Yi-34B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[01-ai/Yi-34B](https://huggingface.co/01-ai/Yi-34B)|
+|yi-34b-200k|[01ai/Yi-34B-200K](https://modelscope.cn/models/01ai/Yi-34B-200K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[01-ai/Yi-34B-200K](https://huggingface.co/01-ai/Yi-34B-200K)|
+|yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|&#x2714;|&#x2714;||-|[01-ai/Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat)|
+|yi-vl-6b-chat|[01ai/Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary)|q_proj, k_proj, v_proj|yi-vl|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|[01-ai/Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B)|
+|yi-vl-34b-chat|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B/summary)|q_proj, k_proj, v_proj|yi-vl|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|[01-ai/Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)|
+|internlm-7b|[Shanghai_AI_Laboratory/internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2714;||-|[internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b)|
+|internlm-7b-chat|[Shanghai_AI_Laboratory/internlm-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b/summary)|q_proj, k_proj, v_proj|internlm|&#x2718;|&#x2714;||-|[internlm/internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b)|
+|internlm-7b-chat-8k|[Shanghai_AI_Laboratory/internlm-chat-7b-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary)|q_proj, k_proj, v_proj|internlm|&#x2718;|&#x2714;||-|-|
+|internlm-20b|[Shanghai_AI_Laboratory/internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2714;||-|[internlm/internlm2-20b](https://huggingface.co/internlm/internlm2-20b)|
+|internlm-20b-chat|[Shanghai_AI_Laboratory/internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)|q_proj, k_proj, v_proj|internlm|&#x2718;|&#x2714;||-|[internlm/internlm2-chat-20b](https://huggingface.co/internlm/internlm2-chat-20b)|
+|internlm2-1_8b|[Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|[internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b)|
+|internlm2-1_8b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|[internlm/internlm2-chat-1_8b-sft](https://huggingface.co/internlm/internlm2-chat-1_8b-sft)|
+|internlm2-1_8b-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|[internlm/internlm2-chat-1_8b](https://huggingface.co/internlm/internlm2-chat-1_8b)|
+|internlm2-7b-base|[Shanghai_AI_Laboratory/internlm2-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-7b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|[internlm/internlm2-base-7b](https://huggingface.co/internlm/internlm2-base-7b)|
+|internlm2-7b|[Shanghai_AI_Laboratory/internlm2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-7b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|[internlm/internlm2-7b](https://huggingface.co/internlm/internlm2-7b)|
+|internlm2-7b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b-sft/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|[internlm/internlm2-chat-7b-sft](https://huggingface.co/internlm/internlm2-chat-7b-sft)|
+|internlm2-7b-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|[internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b)|
+|internlm2-20b-base|[Shanghai_AI_Laboratory/internlm2-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-20b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|[internlm/internlm2-base-20b](https://huggingface.co/internlm/internlm2-base-20b)|
+|internlm2-20b|[Shanghai_AI_Laboratory/internlm2-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-20b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||-|[internlm/internlm2-20b](https://huggingface.co/internlm/internlm2-20b)|
+|internlm2-20b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b-sft/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|[internlm/internlm2-chat-20b-sft](https://huggingface.co/internlm/internlm2-chat-20b-sft)|
+|internlm2-20b-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||-|[internlm/internlm2-chat-20b](https://huggingface.co/internlm/internlm2-chat-20b)|
+|internlm2-math-7b|[Shanghai_AI_Laboratory/internlm2-math-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-7b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||math|[internlm/internlm2-math-base-7b](https://huggingface.co/internlm/internlm2-math-base-7b)|
+|internlm2-math-7b-chat|[Shanghai_AI_Laboratory/internlm2-math-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||math|[internlm/internlm2-math-7b](https://huggingface.co/internlm/internlm2-math-7b)|
+|internlm2-math-20b|[Shanghai_AI_Laboratory/internlm2-math-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-20b/summary)|wqkv|default-generation-bos|&#x2714;|&#x2714;||math|[internlm/internlm2-math-base-20b](https://huggingface.co/internlm/internlm2-math-base-20b)|
+|internlm2-math-20b-chat|[Shanghai_AI_Laboratory/internlm2-math-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-20b/summary)|wqkv|internlm2|&#x2714;|&#x2714;||math|[internlm/internlm2-math-20b](https://huggingface.co/internlm/internlm2-math-20b)|
+|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|wqkv|internlm-xcomposer2|&#x2714;|&#x2718;||multi-modal, vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)|
+|deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[deepseek-ai/deepseek-llm-7b-base](https://huggingface.co/deepseek-ai/deepseek-llm-7b-base)|
+|deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||-|[deepseek-ai/deepseek-llm-7b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat)|
+|deepseek-moe-16b|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[deepseek-ai/deepseek-moe-16b-base](https://huggingface.co/deepseek-ai/deepseek-moe-16b-base)|
+|deepseek-moe-16b-chat|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||-|[deepseek-ai/deepseek-moe-16b-chat](https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat)|
+|deepseek-67b|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[deepseek-ai/deepseek-llm-67b-base](https://huggingface.co/deepseek-ai/deepseek-llm-67b-base)|
+|deepseek-67b-chat|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||-|[deepseek-ai/deepseek-llm-67b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat)|
+|deepseek-coder-1_3b|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||coding|[deepseek-ai/deepseek-coder-1.3b-base](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base)|
+|deepseek-coder-1_3b-instruct|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||coding|[deepseek-ai/deepseek-coder-1.3b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-instruct)|
+|deepseek-coder-6_7b|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||coding|[deepseek-ai/deepseek-coder-6.7b-base](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base)|
+|deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)|
+|deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)|
+|deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)|
+|deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)|
+|deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)|
+|deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)|
+|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;||multi-modal, vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
+|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|&#x2714;|&#x2718;||multi-modal, vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
+|gemma-2b|[AI-ModelScope/gemma-2b](https://modelscope.cn/models/AI-ModelScope/gemma-2b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-2b](https://huggingface.co/google/gemma-2b)|
+|gemma-7b|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-7b](https://huggingface.co/google/gemma-7b)|
+|gemma-2b-instruct|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it/summary)|q_proj, k_proj, v_proj|gemma|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-2b-it](https://huggingface.co/google/gemma-2b-it)|
+|gemma-7b-instruct|[AI-ModelScope/gemma-7b-it](https://modelscope.cn/models/AI-ModelScope/gemma-7b-it/summary)|q_proj, k_proj, v_proj|gemma|&#x2714;|&#x2714;|transformers>=4.38|-|[google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it)|
+|minicpm-1b-sft-chat|[OpenBMB/MiniCPM-1B-sft-bf16](https://modelscope.cn/models/OpenBMB/MiniCPM-1B-sft-bf16/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;|transformers>=4.36.0|-|[openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16)|
+|minicpm-2b-sft-chat|[OpenBMB/MiniCPM-2B-sft-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-sft-fp32/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;||-|[openbmb/MiniCPM-2B-sft-fp32](https://huggingface.co/openbmb/MiniCPM-2B-sft-fp32)|
+|minicpm-2b-chat|[OpenBMB/MiniCPM-2B-dpo-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp32/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;||-|[openbmb/MiniCPM-2B-dpo-fp32](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32)|
+|minicpm-2b-128k|[OpenBMB/MiniCPM-2B-128k](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-128k/summary)|q_proj, k_proj, v_proj|chatml|&#x2714;|&#x2714;|transformers>=4.36.0|-|[openbmb/MiniCPM-2B-128k](https://huggingface.co/openbmb/MiniCPM-2B-128k)|
+|minicpm-moe-8x2b|[OpenBMB/MiniCPM-MoE-8x2B](https://modelscope.cn/models/OpenBMB/MiniCPM-MoE-8x2B/summary)|q_proj, k_proj, v_proj|minicpm|&#x2714;|&#x2714;|transformers>=4.36.0|-|[openbmb/MiniCPM-MoE-8x2B](https://huggingface.co/openbmb/MiniCPM-MoE-8x2B)|
+|minicpm-v-3b-chat|[OpenBMB/MiniCPM-V](https://modelscope.cn/models/OpenBMB/MiniCPM-V/summary)|q_proj, k_proj, v_proj|minicpm-v|&#x2714;|&#x2718;||-|[openbmb/MiniCPM-V](https://huggingface.co/openbmb/MiniCPM-V)|
+|minicpm-v-v2|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2/summary)|q_proj, k_proj, v_proj|minicpm-v|&#x2714;|&#x2718;||-|[openbmb/MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2)|
+|openbuddy-llama2-13b-chat|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://huggingface.co/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16)|
+|openbuddy-llama-65b-chat|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama-65b-v8-bf16)|
+|openbuddy-llama2-70b-chat|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16)|
+|openbuddy-mistral-7b-chat|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v17.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.34|-|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://huggingface.co/OpenBuddy/openbuddy-mistral-7b-v17.1-32k)|
+|openbuddy-zephyr-7b-chat|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.34|-|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://huggingface.co/OpenBuddy/openbuddy-zephyr-7b-v14.1)|
+|openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||-|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://huggingface.co/OpenBuddy/openbuddy-deepseek-67b-v15.2)|
+|openbuddy-mixtral-moe-7b-chat|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.36|-|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://huggingface.co/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k)|
+|mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.34|-|[mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)|
+|mistral-7b-v2|[AI-ModelScope/Mistral-7B-v0.2-hf](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.2-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.34|-|[alpindale/Mistral-7B-v0.2-hf](https://huggingface.co/alpindale/Mistral-7B-v0.2-hf)|
+|mistral-7b-instruct|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.34|-|[mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)|
+|mistral-7b-instruct-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.34|-|[mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)|
+|mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.36|-|[mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)|
+|mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.36|-|[mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)|
+|mixtral-moe-7b-aqlm-2bit-1x16|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf)|
+|mixtral-moe-8x22b-v1|[AI-ModelScope/Mixtral-8x22B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x22B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.36|-|[mistral-community/Mixtral-8x22B-v0.1](https://huggingface.co/mistral-community/Mixtral-8x22B-v0.1)|
+|baichuan-7b|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)|W_pack|default-generation|&#x2718;|&#x2714;|transformers<4.34|-|[baichuan-inc/Baichuan-7B](https://huggingface.co/baichuan-inc/Baichuan-7B)|
+|baichuan-13b|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;|transformers<4.34|-|[baichuan-inc/Baichuan-13B-Base](https://huggingface.co/baichuan-inc/Baichuan-13B-Base)|
+|baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;|transformers<4.34|-|[baichuan-inc/Baichuan-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan-13B-Chat)|
+|baichuan2-7b|[baichuan-inc/Baichuan2-7B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;||-|[baichuan-inc/Baichuan2-7B-Base](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base)|
+|baichuan2-7b-chat|[baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;||-|[baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)|
+|baichuan2-7b-chat-int4|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary)|W_pack|baichuan|&#x2718;|&#x2718;|bitsandbytes<0.41.2, accelerate<0.26|-|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat-4bits)|
+|baichuan2-13b|[baichuan-inc/Baichuan2-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;||-|[baichuan-inc/Baichuan2-13B-Base](https://huggingface.co/baichuan-inc/Baichuan2-13B-Base)|
+|baichuan2-13b-chat|[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;||-|[baichuan-inc/Baichuan2-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat)|
+|baichuan2-13b-chat-int4|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)|W_pack|baichuan|&#x2718;|&#x2718;|bitsandbytes<0.41.2, accelerate<0.26|-|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat-4bits)|
+|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|&#x2714;|&#x2718;|transformers<4.35, icecream|-|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
+|mplug-owl2d1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|&#x2714;|&#x2718;|transformers<4.35, icecream|-|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
+|yuan2-2b-instruct|[YuanLLM/Yuan2.0-2B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-2B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|[IEITYuan/Yuan2-2B-hf](https://huggingface.co/IEITYuan/Yuan2-2B-hf)|
+|yuan2-2b-janus-instruct|[YuanLLM/Yuan2-2B-Janus-hf](https://modelscope.cn/models/YuanLLM/Yuan2-2B-Janus-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|[IEITYuan/Yuan2-2B-Janus-hf](https://huggingface.co/IEITYuan/Yuan2-2B-Janus-hf)|
+|yuan2-51b-instruct|[YuanLLM/Yuan2.0-51B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-51B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|[IEITYuan/Yuan2-51B-hf](https://huggingface.co/IEITYuan/Yuan2-51B-hf)|
+|yuan2-102b-instruct|[YuanLLM/Yuan2.0-102B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-102B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||-|[IEITYuan/Yuan2-102B-hf](https://huggingface.co/IEITYuan/Yuan2-102B-hf)|
+|xverse-7b|[xverse/XVERSE-7B](https://modelscope.cn/models/xverse/XVERSE-7B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[xverse/XVERSE-7B](https://huggingface.co/xverse/XVERSE-7B)|
+|xverse-7b-chat|[xverse/XVERSE-7B-Chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary)|q_proj, k_proj, v_proj|xverse|&#x2718;|&#x2718;||-|[xverse/XVERSE-7B-Chat](https://huggingface.co/xverse/XVERSE-7B-Chat)|
+|xverse-13b|[xverse/XVERSE-13B](https://modelscope.cn/models/xverse/XVERSE-13B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[xverse/XVERSE-13B](https://huggingface.co/xverse/XVERSE-13B)|
+|xverse-13b-chat|[xverse/XVERSE-13B-Chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)|q_proj, k_proj, v_proj|xverse|&#x2718;|&#x2718;||-|[xverse/XVERSE-13B-Chat](https://huggingface.co/xverse/XVERSE-13B-Chat)|
+|xverse-65b|[xverse/XVERSE-65B](https://modelscope.cn/models/xverse/XVERSE-65B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[xverse/XVERSE-65B](https://huggingface.co/xverse/XVERSE-65B)|
+|xverse-65b-v2|[xverse/XVERSE-65B-2](https://modelscope.cn/models/xverse/XVERSE-65B-2/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[xverse/XVERSE-65B-2](https://huggingface.co/xverse/XVERSE-65B-2)|
+|xverse-65b-chat|[xverse/XVERSE-65B-Chat](https://modelscope.cn/models/xverse/XVERSE-65B-Chat/summary)|q_proj, k_proj, v_proj|xverse|&#x2718;|&#x2718;||-|[xverse/XVERSE-65B-Chat](https://huggingface.co/xverse/XVERSE-65B-Chat)|
+|xverse-13b-256k|[xverse/XVERSE-13B-256K](https://modelscope.cn/models/xverse/XVERSE-13B-256K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[xverse/XVERSE-13B-256K](https://huggingface.co/xverse/XVERSE-13B-256K)|
+|xverse-moe-a4_2b|[xverse/XVERSE-MoE-A4.2B](https://modelscope.cn/models/xverse/XVERSE-MoE-A4.2B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[xverse/XVERSE-MoE-A4.2B](https://huggingface.co/xverse/XVERSE-MoE-A4.2B)|
+|orion-14b|[OrionStarAI/Orion-14B-Base](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2718;||-|[OrionStarAI/Orion-14B-Base](https://huggingface.co/OrionStarAI/Orion-14B-Base)|
+|orion-14b-chat|[OrionStarAI/Orion-14B-Chat](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat/summary)|q_proj, k_proj, v_proj|orion|&#x2714;|&#x2718;||-|[OrionStarAI/Orion-14B-Chat](https://huggingface.co/OrionStarAI/Orion-14B-Chat)|
+|bluelm-7b|[vivo-ai/BlueLM-7B-Base](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2718;||-|[vivo-ai/BlueLM-7B-Base](https://huggingface.co/vivo-ai/BlueLM-7B-Base)|
+|bluelm-7b-32k|[vivo-ai/BlueLM-7B-Base-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2718;||-|[vivo-ai/BlueLM-7B-Base-32K](https://huggingface.co/vivo-ai/BlueLM-7B-Base-32K)|
+|bluelm-7b-chat|[vivo-ai/BlueLM-7B-Chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary)|q_proj, k_proj, v_proj|bluelm|&#x2718;|&#x2718;||-|[vivo-ai/BlueLM-7B-Chat](https://huggingface.co/vivo-ai/BlueLM-7B-Chat)|
+|bluelm-7b-chat-32k|[vivo-ai/BlueLM-7B-Chat-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)|q_proj, k_proj, v_proj|bluelm|&#x2718;|&#x2718;||-|[vivo-ai/BlueLM-7B-Chat-32K](https://huggingface.co/vivo-ai/BlueLM-7B-Chat-32K)|
+|ziya2-13b|[Fengshenbang/Ziya2-13B-Base](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[IDEA-CCNL/Ziya2-13B-Base](https://huggingface.co/IDEA-CCNL/Ziya2-13B-Base)|
+|ziya2-13b-chat|[Fengshenbang/Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)|q_proj, k_proj, v_proj|ziya|&#x2714;|&#x2714;||-|[IDEA-CCNL/Ziya2-13B-Chat](https://huggingface.co/IDEA-CCNL/Ziya2-13B-Chat)|
+|skywork-13b|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2718;|&#x2718;||-|[Skywork/Skywork-13B-base](https://huggingface.co/Skywork/Skywork-13B-base)|
+|skywork-13b-chat|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)|q_proj, k_proj, v_proj|skywork|&#x2718;|&#x2718;||-|-|
+|zephyr-7b-beta-chat|[modelscope/zephyr-7b-beta](https://modelscope.cn/models/modelscope/zephyr-7b-beta/summary)|q_proj, k_proj, v_proj|zephyr|&#x2714;|&#x2714;|transformers>=4.34|-|[HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)|
+|polylm-13b|[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary)|c_attn|default-generation|&#x2718;|&#x2718;||-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)|
+|seqgpt-560m|[damo/nlp_seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)|query_key_value|default-generation|&#x2718;|&#x2714;||-|[DAMO-NLP/SeqGPT-560M](https://huggingface.co/DAMO-NLP/SeqGPT-560M)|
+|sus-34b-chat|[SUSTC/SUS-Chat-34B](https://modelscope.cn/models/SUSTC/SUS-Chat-34B/summary)|q_proj, k_proj, v_proj|sus|&#x2714;|&#x2714;||-|[SUSTech/SUS-Chat-34B](https://huggingface.co/SUSTech/SUS-Chat-34B)|
+|tongyi-finance-14b|[TongyiFinance/Tongyi-Finance-14B](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary)|c_attn|default-generation|&#x2714;|&#x2714;||financial|-|
+|tongyi-finance-14b-chat|[TongyiFinance/Tongyi-Finance-14B-Chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary)|c_attn|qwen|&#x2714;|&#x2714;||financial|[jxy/Tongyi-Finance-14B-Chat](https://huggingface.co/jxy/Tongyi-Finance-14B-Chat)|
+|tongyi-finance-14b-chat-int4|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5|financial|[jxy/Tongyi-Finance-14B-Chat-Int4](https://huggingface.co/jxy/Tongyi-Finance-14B-Chat-Int4)|
+|codefuse-codellama-34b-chat|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)|q_proj, k_proj, v_proj|codefuse-codellama|&#x2714;|&#x2714;||coding|[codefuse-ai/CodeFuse-CodeLlama-34B](https://huggingface.co/codefuse-ai/CodeFuse-CodeLlama-34B)|
+|codefuse-codegeex2-6b-chat|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B/summary)|query_key_value|codefuse|&#x2718;|&#x2714;|transformers<4.34|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)|
+|codefuse-qwen-14b-chat|[codefuse-ai/CodeFuse-QWen-14B](https://modelscope.cn/models/codefuse-ai/CodeFuse-QWen-14B/summary)|c_attn|codefuse|&#x2714;|&#x2714;||coding|[codefuse-ai/CodeFuse-QWen-14B](https://huggingface.co/codefuse-ai/CodeFuse-QWen-14B)|
+|phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|&#x2714;|&#x2714;||coding|[microsoft/phi-2](https://huggingface.co/microsoft/phi-2)|
+|cogvlm-17b-instruct|[ZhipuAI/cogvlm-chat](https://modelscope.cn/models/ZhipuAI/cogvlm-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense|cogvlm-instruct|&#x2718;|&#x2718;||multi-modal, vision|[THUDM/cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf)|
+|cogagent-18b-chat|[ZhipuAI/cogagent-chat](https://modelscope.cn/models/ZhipuAI/cogagent-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-chat|&#x2718;|&#x2718;||multi-modal, vision|[THUDM/cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf)|
+|cogagent-18b-instruct|[ZhipuAI/cogagent-vqa](https://modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-instruct|&#x2718;|&#x2718;||multi-modal, vision|[THUDM/cogagent-vqa-hf](https://huggingface.co/THUDM/cogagent-vqa-hf)|
+|mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)|
+|mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)|
+|mamba-390m|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-390m-hf](https://huggingface.co/state-spaces/mamba-390m-hf)|
+|mamba-790m|[AI-ModelScope/mamba-790m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-790m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-790m-hf](https://huggingface.co/state-spaces/mamba-790m-hf)|
+|mamba-1.4b|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-1.4b-hf](https://huggingface.co/state-spaces/mamba-1.4b-hf)|
+|mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|&#x2718;|&#x2718;|transformers>=4.39.0|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)|
+|telechat-7b|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B/summary)|key_value, query|telechat|&#x2714;|&#x2718;||-|[Tele-AI/telechat-7B](https://huggingface.co/Tele-AI/telechat-7B)|
+|telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|key_value, query|telechat|&#x2714;|&#x2718;||-|[Tele-AI/TeleChat-12B](https://huggingface.co/Tele-AI/TeleChat-12B)|
+|grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|&#x2718;|&#x2718;||-|[hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1)|
+|dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|[databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct)|
+|dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|&#x2714;|&#x2714;|transformers>=4.36|-|[databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)|
+|mengzi3-13b-base|[langboat/Mengzi3-13B-Base](https://modelscope.cn/models/langboat/Mengzi3-13B-Base/summary)|q_proj, k_proj, v_proj|mengzi|&#x2714;|&#x2714;||-|[Langboat/Mengzi3-13B-Base](https://huggingface.co/Langboat/Mengzi3-13B-Base)|
+|c4ai-command-r-v01|[AI-ModelScope/c4ai-command-r-v01](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-v01/summary)|q_proj, k_proj, v_proj|c4ai|&#x2714;|&#x2718;|transformers>=4.39.1|-|[CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)|
+|c4ai-command-r-plus|[AI-ModelScope/c4ai-command-r-plus](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-plus/summary)|q_proj, k_proj, v_proj|c4ai|&#x2714;|&#x2718;|transformers>4.39|-|[CohereForAI/c4ai-command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus)|
 
 
 ## dataset
@@ -233,95 +235,95 @@ The table below introduces the datasets supported by SWIFT:
 - Size: The data row count of the dataset.
 - Statistic: Dataset statistics. We use the number of tokens for statistics, which helps adjust the max_length hyperparameter. We concatenate the training and validation sets of the dataset and then compute the statistics. We use qwen's tokenizer to tokenize the dataset. Different tokenizers produce different statistics. If you want to obtain token statistics for tokenizers of other models, you can use the script to get them yourself.
 
-| Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags |
-| ------------ | ---------- | ---------- | -------- | ----------------- | ---- |
-|🔥ms-bench|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|316228|0|345.0±441.3, min=22, max=30960|chat, general, multi-round|
-|🔥ms-bench-mini|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|19492|0|353.9±439.4, min=29, max=12078|chat, general, multi-round|
-|🔥alpaca-en|[AI-ModelScope/alpaca-gpt4-data-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)|52002|0|176.2±125.8, min=26, max=740|chat, general|
-|🔥alpaca-zh|[AI-ModelScope/alpaca-gpt4-data-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)|48818|0|162.1±93.9, min=26, max=856|chat, general|
-|multi-alpaca-all|[damo/nlp_polylm_multialpaca_sft](https://modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary)|131867|0|112.9±50.6, min=26, max=1226|chat, general, multilingual|
-|instinwild-en|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|52191|0|160.2±69.7, min=33, max=763|chat, general|
-|instinwild-zh|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|51504|0|130.3±45.1, min=28, max=1434|chat, general|
-|cot-en|[YorickHe/CoT](https://modelscope.cn/datasets/YorickHe/CoT/summary)|74771|0|122.7±64.8, min=51, max=8320|chat, general|
-|cot-zh|[YorickHe/CoT_zh](https://modelscope.cn/datasets/YorickHe/CoT_zh/summary)|74771|0|117.5±70.8, min=43, max=9636|chat, general|
-|firefly-all-zh|[wyj123456/firefly](https://modelscope.cn/datasets/wyj123456/firefly/summary)|1649399|0|178.1±260.4, min=26, max=12516|chat, general|
-|instruct-en|[wyj123456/instruct](https://modelscope.cn/datasets/wyj123456/instruct/summary)|888970|0|268.9±331.2, min=26, max=7252|chat, general|
-|gpt4all-en|[wyj123456/GPT4all](https://modelscope.cn/datasets/wyj123456/GPT4all/summary)|806199|0|302.5±384.1, min=27, max=7391|chat, general|
-|sharegpt-en|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|99799|0|1045.7±431.9, min=22, max=7907|chat, general, multi-round|
-|sharegpt-zh|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|135399|0|806.3±771.7, min=21, max=65318|chat, general, multi-round|
-|tulu-v2-sft-mixture|[AI-ModelScope/tulu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture/summary)|326154|0|867.8±996.4, min=22, max=12111|chat, multilingual, general, multi-round|
-|wikipedia-zh|[AI-ModelScope/wikipedia-cn-20230720-filtered](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered/summary)|254547|0|568.4±713.2, min=37, max=78678|text-generation, general, pretrained|
-|open-orca|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|3239027|0|360.4±402.9, min=27, max=8672|chat, multilingual, general|
-|open-orca-gpt4|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|994896|0|382.3±417.4, min=31, max=8740|chat, multilingual, general|
-|sharegpt-gpt4|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|103063|0|1286.2±2089.4, min=22, max=221080|chat, multilingual, general, multi-round|
-|🔥sharegpt-gpt4-mini|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|6205|0|3511.6±6068.5, min=33, max=116018|chat, multilingual, general, multi-round, gpt4|
-|🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)|30000|0|647.7±217.1, min=199, max=2722|chat, agent, multi-round|
-|ms-agent-for-agentfabric-default|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|30000|0|617.8±199.1, min=251, max=2657|chat, agent, multi-round|
-|ms-agent-for-agentfabric-addition|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|488|0|2084.9±1514.8, min=489, max=7354|chat, agent, multi-round|
-|damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|422115|161|965.7±440.9, min=321, max=31535|chat, agent, multi-round|
-|damo-agent-mini-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|39964|152|1230.9±350.1, min=558, max=4982|chat, agent, multi-round|
-|agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|1866|0|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|
-|code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)|20016|0|100.1±60.1, min=29, max=1776|chat, coding|
-|🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)|2359|0|723.8±233.5, min=259, max=2117|chat, coding|
-|🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)|27224|0|483.6±193.9, min=45, max=3082|chat, coding|
-|🔥codefuse-evol-instruction-zh|[codefuse-ai/Evol-instruction-66k](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k/summary)|66862|0|439.6±206.3, min=37, max=2983|chat, coding|
-|medical-en|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|117117|500|257.4±89.1, min=36, max=2564|chat, medical|
-|medical-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|1950472|500|167.2±219.7, min=26, max=27351|chat, medical|
-|medical-mini-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|50000|500|168.1±220.8, min=26, max=12320|chat, medical|
-|🔥disc-med-sft-zh|[AI-ModelScope/DISC-Med-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Med-SFT/summary)|441767|0|354.1±193.1, min=25, max=2231|chat, medical|
-|lawyer-llama-zh|[AI-ModelScope/lawyer_llama_data](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary)|21476|0|194.4±91.7, min=27, max=924|chat, law|
-|tigerbot-law-zh|[AI-ModelScope/tigerbot-law-plugin](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)|55895|0|109.9±126.4, min=37, max=18878|text-generation, law, pretrained|
-|🔥disc-law-sft-zh|[AI-ModelScope/DISC-Law-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Law-SFT/summary)|166758|0|533.7±495.4, min=30, max=15169|chat, law|
-|🔥blossom-math-zh|[AI-ModelScope/blossom-math-v2](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2/summary)|10000|0|169.3±58.7, min=35, max=563|chat, math|
-|school-math-zh|[AI-ModelScope/school_math_0.25M](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M/summary)|248480|0|157.6±72.1, min=33, max=3450|chat, math|
-|open-platypus-en|[AI-ModelScope/Open-Platypus](https://modelscope.cn/datasets/AI-ModelScope/Open-Platypus/summary)|24926|0|367.9±254.8, min=30, max=3951|chat, math|
-|text2sql-en|[AI-ModelScope/texttosqlv2_25000_v2](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2/summary)|25000|0|274.6±326.4, min=38, max=1975|chat, sql|
-|🔥sql-create-context-en|[AI-ModelScope/sql-create-context](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context/summary)|78577|0|80.2±17.8, min=36, max=456|chat, sql|
-|🔥advertise-gen-zh|[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)|97484|915|131.6±21.7, min=52, max=242|text-generation|
-|🔥dureader-robust-zh|[modelscope/DuReader_robust-QG](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary)|15937|1962|242.1±137.4, min=61, max=1417|text-generation|
-|cmnli-zh|[clue](https://modelscope.cn/datasets/clue/summary)|391783|12241|83.6±16.6, min=52, max=200|text-generation, classification|
-|🔥cmnli-mini-zh|[clue](https://modelscope.cn/datasets/clue/summary)|20000|200|82.9±16.3, min=52, max=188|text-generation, classification|
-|🔥jd-sentiment-zh|[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd/summary)|45012|4988|67.0±83.2, min=40, max=4040|text-generation, classification|
-|🔥hc3-zh|[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese/summary)|39781|0|177.8±81.5, min=58, max=3052|text-generation, classification|
-|🔥hc3-en|[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3/summary)|11021|0|299.3±138.7, min=66, max=2268|text-generation, classification|
-|finance-en|[wyj123456/finance_en](https://modelscope.cn/datasets/wyj123456/finance_en/summary)|68911|0|135.6±134.3, min=26, max=3525|chat, financial|
-|poetry-zh|[modelscope/chinese-poetry-collection](https://modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary)|388599|1710|55.2±9.4, min=23, max=83|text-generation, poetry|
-|webnovel-zh|[AI-ModelScope/webnovel_cn](https://modelscope.cn/datasets/AI-ModelScope/webnovel_cn/summary)|50000|0|1478.9±11526.1, min=100, max=490484|chat, novel|
-|generated-chat-zh|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)|396004|0|273.3±52.0, min=32, max=873|chat, character-dialogue|
-|cls-fudan-news-zh|[damo/zh_cls_fudan-news](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/summary)|4959|0|3234.4±2547.5, min=91, max=19548|chat, classification|
-|ner-jave-zh|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)|1266|0|118.3±45.5, min=44, max=223|chat, ner|
-|long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)|11998|0|9619.0±8295.8, min=36, max=78925|longlora, QA|
-|coco-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|414113|40504|298.8±2.8, min=294, max=351|chat, multi-modal, vision|
-|🔥coco-mini-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|298.8±2.8, min=294, max=339|chat, multi-modal, vision|
-|🔥coco-mini-en-2|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|36.8±2.8, min=32, max=77|chat, multi-modal, vision|
-|capcha-images|[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images/summary)|6000|2000|29.0±0.0, min=29, max=29|chat, multi-modal, vision|
-|aishell1-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|134424|7176|152.2±36.8, min=63, max=419|chat, multi-modal, audio|
-|🔥aishell1-mini-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|14326|200|152.0±35.5, min=74, max=359|chat, multi-modal, audio|
-|hh-rlhf-harmless-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|42462|2308|167.2±123.1, min=22, max=986|rlhf, dpo, pairwise|
-|hh-rlhf-helpful-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|43777|2348|201.9±135.2, min=25, max=1070|rlhf, dpo, pairwise|
-|hh-rlhf-helpful-online|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|10150|1137|401.5±278.7, min=32, max=1987|rlhf, dpo, pairwise|
-|hh-rlhf-helpful-rejection-sampled|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise|
-|hh-rlhf-red-team-attempts|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise|
-|🔥hh-rlhf-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|172085|9292|172.8±124.0, min=22, max=1638|rlhf, dpo, pairwise|
-|hh-rlhf-cn-harmless-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|143.9±109.4, min=24, max=3078|rlhf, dpo, pairwise|
-|hh-rlhf-cn-helpful-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|176.8±120.0, min=26, max=1420|rlhf, dpo, pairwise|
-|hh-rlhf-cn-harmless-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|167.5±123.2, min=22, max=986|rlhf, dpo, pairwise|
-|hh-rlhf-cn-helpful-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|202.2±135.3, min=25, max=1070|rlhf, dpo, pairwise|
-|stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)|4483004|0|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|
-|pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)|214670|0|1612.3±8856.2, min=11, max=1208955|text-generation, awq|
-|🔥coig-cqia-chinese-traditional|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1111|0|172.6±59.9, min=55, max=856|general|
-|🔥coig-cqia-coig-pc|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3000|0|353.5±859.6, min=34, max=19288|general|
-|🔥coig-cqia-exam|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|4856|0|275.0±240.0, min=45, max=4932|general|
-|🔥coig-cqia-finance|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|11288|0|1266.4±561.1, min=60, max=10582|general|
-|🔥coig-cqia-douban|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3086|0|402.9±544.7, min=88, max=10870|general|
-|🔥coig-cqia-human-value|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1007|0|151.2±77.3, min=39, max=656|general|
-|🔥coig-cqia-logi-qa|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|421|0|309.8±188.8, min=43, max=1306|general|
-|🔥coig-cqia-ruozhiba|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|240|0|189.8±62.2, min=33, max=505|general|
-|🔥coig-cqia-segmentfault|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|458|0|449.0±495.8, min=87, max=6342|general|
-|🔥coig-cqia-wiki|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|10603|0|619.2±515.8, min=73, max=10140|general|
-|🔥coig-cqia-wikihow|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1485|0|1700.0±790.9, min=260, max=6371|general|
-|🔥coig-cqia-xhs|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1508|0|438.0±179.6, min=129, max=2191|general|
-|🔥coig-cqia-zhihu|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|5631|0|540.7±306.7, min=161, max=3036|general|
-|🔥ruozhiba-post-annual|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|1361|0|36.6±15.3, min=24, max=559|pretrain|
-|🔥ruozhiba-title-good|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|2597|0|41.9±19.3, min=22, max=246|pretrain|
-|🔥ruozhiba-title-norm|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|81700|0|39.9±12.8, min=21, max=386|pretrain|
+| Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags | HF Dataset ID |
+| ------------ | ---------- | ---------- | -------- | ----------------- | ---- | ------------- |
+|🔥ms-bench|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|316228|0|345.0±441.3, min=22, max=30960|chat, general, multi-round|-|
+|🔥ms-bench-mini|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|19492|0|353.9±439.4, min=29, max=12078|chat, general, multi-round|-|
+|🔥alpaca-en|[AI-ModelScope/alpaca-gpt4-data-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)|52002|0|176.2±125.8, min=26, max=740|chat, general|[vicgalle/alpaca-gpt4](https://huggingface.co/datasets/vicgalle/alpaca-gpt4)|
+|🔥alpaca-zh|[AI-ModelScope/alpaca-gpt4-data-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)|48818|0|162.1±93.9, min=26, max=856|chat, general|[c-s-ale/alpaca-gpt4-data-zh](https://huggingface.co/datasets/c-s-ale/alpaca-gpt4-data-zh)|
+|multi-alpaca-all|[damo/nlp_polylm_multialpaca_sft](https://modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary)|131867|0|112.9±50.6, min=26, max=1226|chat, general, multilingual|-|
+|instinwild-en|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|52191|0|160.2±69.7, min=33, max=763|chat, general|-|
+|instinwild-zh|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|51504|0|130.3±45.1, min=28, max=1434|chat, general|-|
+|cot-en|[YorickHe/CoT](https://modelscope.cn/datasets/YorickHe/CoT/summary)|74771|0|122.7±64.8, min=51, max=8320|chat, general|-|
+|cot-zh|[YorickHe/CoT_zh](https://modelscope.cn/datasets/YorickHe/CoT_zh/summary)|74771|0|117.5±70.8, min=43, max=9636|chat, general|-|
+|firefly-all-zh|[wyj123456/firefly](https://modelscope.cn/datasets/wyj123456/firefly/summary)|1649399|0|178.1±260.4, min=26, max=12516|chat, general|-|
+|instruct-en|[wyj123456/instruct](https://modelscope.cn/datasets/wyj123456/instruct/summary)|888970|0|268.9±331.2, min=26, max=7252|chat, general|-|
+|gpt4all-en|[wyj123456/GPT4all](https://modelscope.cn/datasets/wyj123456/GPT4all/summary)|806199|0|302.5±384.1, min=27, max=7391|chat, general|-|
+|sharegpt-en|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|99799|0|1045.7±431.9, min=22, max=7907|chat, general, multi-round|-|
+|sharegpt-zh|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|135399|0|806.3±771.7, min=21, max=65318|chat, general, multi-round|-|
+|tulu-v2-sft-mixture|[AI-ModelScope/tulu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture/summary)|326154|0|867.8±996.4, min=22, max=12111|chat, multilingual, general, multi-round|[allenai/tulu-v2-sft-mixture](https://huggingface.co/datasets/allenai/tulu-v2-sft-mixture)|
+|wikipedia-zh|[AI-ModelScope/wikipedia-cn-20230720-filtered](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered/summary)|254547|0|568.4±713.2, min=37, max=78678|text-generation, general, pretrained|[pleisto/wikipedia-cn-20230720-filtered](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)|
+|open-orca|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|3239027|0|360.4±402.9, min=27, max=8672|chat, multilingual, general|-|
+|open-orca-gpt4|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|994896|0|382.3±417.4, min=31, max=8740|chat, multilingual, general|-|
+|sharegpt-gpt4|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|103063|0|1286.2±2089.4, min=22, max=221080|chat, multilingual, general, multi-round|-|
+|🔥sharegpt-gpt4-mini|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|6205|0|3511.6±6068.5, min=33, max=116018|chat, multilingual, general, multi-round, gpt4|-|
+|🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)|30000|0|647.7±217.1, min=199, max=2722|chat, agent, multi-round|-|
+|ms-agent-for-agentfabric-default|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|30000|0|617.8±199.1, min=251, max=2657|chat, agent, multi-round|-|
+|ms-agent-for-agentfabric-addition|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|488|0|2084.9±1514.8, min=489, max=7354|chat, agent, multi-round|-|
+|damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|422115|161|965.7±440.9, min=321, max=31535|chat, agent, multi-round|-|
+|damo-agent-mini-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|39964|152|1230.9±350.1, min=558, max=4982|chat, agent, multi-round|-|
+|agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|1866|0|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|-|
+|code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)|20016|0|100.1±60.1, min=29, max=1776|chat, coding|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)|
+|🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)|2359|0|723.8±233.5, min=259, max=2117|chat, coding|-|
+|🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)|27224|0|483.6±193.9, min=45, max=3082|chat, coding|-|
+|🔥codefuse-evol-instruction-zh|[codefuse-ai/Evol-instruction-66k](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k/summary)|66862|0|439.6±206.3, min=37, max=2983|chat, coding|-|
+|medical-en|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|117117|500|257.4±89.1, min=36, max=2564|chat, medical|-|
+|medical-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|1950472|500|167.2±219.7, min=26, max=27351|chat, medical|-|
+|medical-mini-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|50000|500|168.1±220.8, min=26, max=12320|chat, medical|-|
+|🔥disc-med-sft-zh|[AI-ModelScope/DISC-Med-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Med-SFT/summary)|441767|0|354.1±193.1, min=25, max=2231|chat, medical|[Flmc/DISC-Med-SFT](https://huggingface.co/datasets/Flmc/DISC-Med-SFT)|
+|lawyer-llama-zh|[AI-ModelScope/lawyer_llama_data](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary)|21476|0|194.4±91.7, min=27, max=924|chat, law|[Skepsun/lawyer_llama_data](https://huggingface.co/datasets/Skepsun/lawyer_llama_data)|
+|tigerbot-law-zh|[AI-ModelScope/tigerbot-law-plugin](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)|55895|0|109.9±126.4, min=37, max=18878|text-generation, law, pretrained|[TigerResearch/tigerbot-law-plugin](https://huggingface.co/datasets/TigerResearch/tigerbot-law-plugin)|
+|🔥disc-law-sft-zh|[AI-ModelScope/DISC-Law-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Law-SFT/summary)|166758|0|533.7±495.4, min=30, max=15169|chat, law|-|
+|🔥blossom-math-zh|[AI-ModelScope/blossom-math-v2](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2/summary)|10000|0|169.3±58.7, min=35, max=563|chat, math|[Azure99/blossom-math-v2](https://huggingface.co/datasets/Azure99/blossom-math-v2)|
+|school-math-zh|[AI-ModelScope/school_math_0.25M](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M/summary)|248480|0|157.6±72.1, min=33, max=3450|chat, math|[BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)|
+|open-platypus-en|[AI-ModelScope/Open-Platypus](https://modelscope.cn/datasets/AI-ModelScope/Open-Platypus/summary)|24926|0|367.9±254.8, min=30, max=3951|chat, math|[garage-bAInd/Open-Platypus](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)|
+|text2sql-en|[AI-ModelScope/texttosqlv2_25000_v2](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2/summary)|25000|0|274.6±326.4, min=38, max=1975|chat, sql|[Clinton/texttosqlv2_25000_v2](https://huggingface.co/datasets/Clinton/texttosqlv2_25000_v2)|
+|🔥sql-create-context-en|[AI-ModelScope/sql-create-context](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context/summary)|78577|0|80.2±17.8, min=36, max=456|chat, sql|[b-mc2/sql-create-context](https://huggingface.co/datasets/b-mc2/sql-create-context)|
+|🔥advertise-gen-zh|[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)|97484|915|131.6±21.7, min=52, max=242|text-generation|[shibing624/AdvertiseGen](https://huggingface.co/datasets/shibing624/AdvertiseGen)|
+|🔥dureader-robust-zh|[modelscope/DuReader_robust-QG](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary)|15937|1962|242.1±137.4, min=61, max=1417|text-generation|-|
+|cmnli-zh|[clue](https://modelscope.cn/datasets/clue/summary)|391783|12241|83.6±16.6, min=52, max=200|text-generation, classification|[clue](https://huggingface.co/datasets/clue)|
+|🔥cmnli-mini-zh|[clue](https://modelscope.cn/datasets/clue/summary)|20000|200|82.9±16.3, min=52, max=188|text-generation, classification|[clue](https://huggingface.co/datasets/clue)|
+|🔥jd-sentiment-zh|[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd/summary)|45012|4988|67.0±83.2, min=40, max=4040|text-generation, classification|-|
+|🔥hc3-zh|[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese/summary)|39781|0|177.8±81.5, min=58, max=3052|text-generation, classification|[Hello-SimpleAI/HC3-Chinese](https://huggingface.co/datasets/Hello-SimpleAI/HC3-Chinese)|
+|🔥hc3-en|[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3/summary)|11021|0|299.3±138.7, min=66, max=2268|text-generation, classification|[Hello-SimpleAI/HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3)|
+|finance-en|[wyj123456/finance_en](https://modelscope.cn/datasets/wyj123456/finance_en/summary)|68911|0|135.6±134.3, min=26, max=3525|chat, financial|[ssbuild/alpaca_finance_en](https://huggingface.co/datasets/ssbuild/alpaca_finance_en)|
+|poetry-zh|[modelscope/chinese-poetry-collection](https://modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary)|388599|1710|55.2±9.4, min=23, max=83|text-generation, poetry|-|
+|webnovel-zh|[AI-ModelScope/webnovel_cn](https://modelscope.cn/datasets/AI-ModelScope/webnovel_cn/summary)|50000|0|1478.9±11526.1, min=100, max=490484|chat, novel|[zxbsmk/webnovel_cn](https://huggingface.co/datasets/zxbsmk/webnovel_cn)|
+|generated-chat-zh|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)|396004|0|273.3±52.0, min=32, max=873|chat, character-dialogue|[BelleGroup/generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)|
+|cls-fudan-news-zh|[damo/zh_cls_fudan-news](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/summary)|4959|0|3234.4±2547.5, min=91, max=19548|chat, classification|-|
+|ner-jave-zh|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)|1266|0|118.3±45.5, min=44, max=223|chat, ner|-|
+|long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)|11998|0|9619.0±8295.8, min=36, max=78925|longlora, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)|
+|coco-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|414113|40504|298.8±2.8, min=294, max=351|chat, multi-modal, vision|-|
+|🔥coco-mini-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|298.8±2.8, min=294, max=339|chat, multi-modal, vision|-|
+|🔥coco-mini-en-2|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|36.8±2.8, min=32, max=77|chat, multi-modal, vision|-|
+|capcha-images|[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images/summary)|6000|2000|29.0±0.0, min=29, max=29|chat, multi-modal, vision|-|
+|aishell1-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|134424|7176|152.2±36.8, min=63, max=419|chat, multi-modal, audio|-|
+|🔥aishell1-mini-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|14326|200|152.0±35.5, min=74, max=359|chat, multi-modal, audio|-|
+|hh-rlhf-harmless-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|42462|2308|167.2±123.1, min=22, max=986|rlhf, dpo, pairwise|-|
+|hh-rlhf-helpful-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|43777|2348|201.9±135.2, min=25, max=1070|rlhf, dpo, pairwise|-|
+|hh-rlhf-helpful-online|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|10150|1137|401.5±278.7, min=32, max=1987|rlhf, dpo, pairwise|-|
+|hh-rlhf-helpful-rejection-sampled|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise|-|
+|hh-rlhf-red-team-attempts|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise|-|
+|🔥hh-rlhf-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|172085|9292|172.8±124.0, min=22, max=1638|rlhf, dpo, pairwise|-|
+|hh-rlhf-cn-harmless-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|143.9±109.4, min=24, max=3078|rlhf, dpo, pairwise|-|
+|hh-rlhf-cn-helpful-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|176.8±120.0, min=26, max=1420|rlhf, dpo, pairwise|-|
+|hh-rlhf-cn-harmless-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|167.5±123.2, min=22, max=986|rlhf, dpo, pairwise|-|
+|hh-rlhf-cn-helpful-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|202.2±135.3, min=25, max=1070|rlhf, dpo, pairwise|-|
+|stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)|4483004|0|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|-|
+|pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)|214670|0|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)|
+|🔥coig-cqia-chinese-traditional|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1111|0|172.6±59.9, min=55, max=856|general|-|
+|🔥coig-cqia-coig-pc|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3000|0|353.5±859.6, min=34, max=19288|general|-|
+|🔥coig-cqia-exam|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|4856|0|275.0±240.0, min=45, max=4932|general|-|
+|🔥coig-cqia-finance|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|11288|0|1266.4±561.1, min=60, max=10582|general|-|
+|🔥coig-cqia-douban|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3086|0|402.9±544.7, min=88, max=10870|general|-|
+|🔥coig-cqia-human-value|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1007|0|151.2±77.3, min=39, max=656|general|-|
+|🔥coig-cqia-logi-qa|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|421|0|309.8±188.8, min=43, max=1306|general|-|
+|🔥coig-cqia-ruozhiba|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|240|0|189.8±62.2, min=33, max=505|general|-|
+|🔥coig-cqia-segmentfault|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|458|0|449.0±495.8, min=87, max=6342|general|-|
+|🔥coig-cqia-wiki|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|10603|0|619.2±515.8, min=73, max=10140|general|-|
+|🔥coig-cqia-wikihow|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1485|0|1700.0±790.9, min=260, max=6371|general|-|
+|🔥coig-cqia-xhs|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1508|0|438.0±179.6, min=129, max=2191|general|-|
+|🔥coig-cqia-zhihu|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|5631|0|540.7±306.7, min=161, max=3036|general|-|
+|🔥ruozhiba-post-annual|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|1361|0|36.6±15.3, min=24, max=559|pretrain|-|
+|🔥ruozhiba-title-good|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|2597|0|41.9±19.3, min=22, max=246|pretrain|-|
+|🔥ruozhiba-title-norm|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|81700|0|39.9±12.8, min=21, max=386|pretrain|-|
diff --git a/scripts/utils/run_dataset_info.py b/scripts/utils/run_dataset_info.py
index a867bf2600..9255bfaad2 100644
--- a/scripts/utils/run_dataset_info.py
+++ b/scripts/utils/run_dataset_info.py
@@ -30,10 +30,10 @@ def write_dataset_info() -> None:
     res_text_list = []
 
     res_text_list.append(
-        '| Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags |'
+        '| Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags | HF Dataset ID |'
     )
     res_text_list.append(
-        '| ------------ | ---------- | ---------- | -------- | ----------------- | ---- |'
+        '| ------------ | ---------- | ---------- | -------- | ----------------- | ---- | ------------- |'
     )
     if len(text_list) >= 2:
         text_list = text_list[2:]
@@ -88,7 +88,7 @@ def write_dataset_info() -> None:
                 _token_len.append(len(input_ids[i]))
             stat = stat_array(_token_len)[0]
             stat_str = f"{stat['mean']:.1f}±{stat['std']:.1f}, min={stat['min']}, max={stat['max']}"
-        url = f"https://modelscope.cn/datasets/{dataset_info['dataset_id_or_path']}/summary"
+        ms_url = f"https://modelscope.cn/datasets/{dataset_info['dataset_id_or_path']}/summary"
 
         if '🔥' in tags:
             tags.remove('🔥')
@@ -96,9 +96,17 @@ def write_dataset_info() -> None:
         tags_str = ', '.join(tags)
         if len(tags_str) == 0:
             tags_str = '-'
+        hf_dataset_id = dataset_info.get('hf_dataset_id')
+        if hf_dataset_id is None:
+            hf_dataset_id = '-'
+            hf_dataset_id_str = '-'
+        else:
+            hf_url = f'https://huggingface.co/datasets/{hf_dataset_id}'
+            hf_dataset_id_str = f'[{hf_dataset_id}]({hf_url})'
+
         res_text_list.append(
-            f"|{dataset_name}|[{dataset_info['dataset_id_or_path']}]({url})|{train_size}|"
-            f'{val_size}|{stat_str}|{tags_str}|')
+            f"|{dataset_name}|[{dataset_info['dataset_id_or_path']}]({ms_url})|{train_size}|"
+            f'{val_size}|{stat_str}|{tags_str}|{hf_dataset_id_str}|')
     print(f'数据集总数: {len(dataset_name_list)}')
 
     for idx in range(len(fpaths)):
diff --git a/scripts/utils/run_model_info.py b/scripts/utils/run_model_info.py
index 63a636d49c..09a7f6b49f 100644
--- a/scripts/utils/run_model_info.py
+++ b/scripts/utils/run_model_info.py
@@ -12,9 +12,10 @@ def get_model_info_table() -> List[str]:
     model_name_list = ModelType.get_model_name_list()
     result = (
         '| Model Type | Model ID | Default Lora Target Modules | Default Template |'
-        ' Support Flash Attn | Support VLLM | Requires | Tags |\n'
+        ' Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID |\n'
         '| ---------  | -------- | --------------------------- | ---------------- |'
-        ' ------------------ | ------------ | -------- | ---- |\n')
+        ' ------------------ | ------------ | -------- | ---- | ----------- |\n'
+    )
     res: List[str] = []
     bool_mapping = {True: '&#x2714;', False: '&#x2718;'}
     for model_name in model_name_list:
@@ -31,15 +32,23 @@ def get_model_info_table() -> List[str]:
         tags_str = ', '.join(tags)
         if len(tags_str) == 0:
             tags_str = '-'
+        hf_model_id = model_info.get('hf_model_id')
+        if hf_model_id is None:
+            hf_model_id = '-'
         r = [
             model_name, model_id, lora_target_modules, template,
-            support_flash_attn, support_vllm, requires, tags_str
+            support_flash_attn, support_vllm, requires, tags_str, hf_model_id
         ]
         res.append(r)
     text = ''
     for r in res:
-        url = f'https://modelscope.cn/models/{r[1]}/summary'
-        text += f'|{r[0]}|[{r[1]}]({url})|{r[2]}|{r[3]}|{r[4]}|{r[5]}|{r[6]}|{r[7]}|\n'
+        ms_url = f'https://modelscope.cn/models/{r[1]}/summary'
+        if r[8] != '-':
+            hf_url = f'https://huggingface.co/{r[8]}'
+            hf_model_id_str = f'[{r[8]}]({hf_url})'
+        else:
+            hf_model_id_str = '-'
+        text += f'|{r[0]}|[{r[1]}]({ms_url})|{r[2]}|{r[3]}|{r[4]}|{r[5]}|{r[6]}|{r[7]}|{hf_model_id_str}|\n'
     print(f'模型总数: {len(res)}')
     result += text
     for idx, fpath in enumerate(fpaths):
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index 9fab7662bd..72b0d57b2e 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -274,7 +274,8 @@ def set_model_type(self: Union['SftArguments', 'InferArguments']) -> None:
             model_info['revision'] = 'main'
         self.model_revision = model_info['revision']
         if self.model_id_or_path is None:
-            self.model_id_or_path = model_info['model_id_or_path']
+            self.model_id_or_path = model_info[
+                'hf_model_id'] if use_hf else model_info['model_id_or_path']
         requires = model_info['requires']
         for require in requires:
             require_version(require)
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index 9860aa7e27..6ca45e74f9 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -1554,10 +1554,16 @@ def get_dataset(
     for dataset_name in dataset_name_list:
         dataset_info = DATASET_MAPPING[dataset_name]
         use_hf = strtobool(os.environ.get('USE_HF', 'False'))
+        dataset_str_f = 'Downloading the dataset from {hub}, dataset_id: {dataset_id}'
         if use_hf:
             dataset_id_or_path = dataset_info['hf_dataset_id']
+            dataset_str = dataset_str_f.format(
+                hub='HuggingFace', dataset_id=dataset_id_or_path)
         else:
             dataset_id_or_path = dataset_info['dataset_id_or_path']
+            dataset_str = dataset_str_f.format(
+                hub='ModelScope', dataset_id=dataset_id_or_path)
+        logger.info(dataset_str)
         assert dataset_id_or_path is not None, (
             f'dataset_name: {dataset_name}, use_hf: {use_hf}, '
             f'dataset_id_or_path: {dataset_id_or_path}.')
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 7d5b667233..90a24dc54b 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -1719,6 +1719,14 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_vllm=True,
     requires=['transformers>=4.37'],
     hf_model_id='Qwen/Qwen1.5-MoE-A2.7B-Chat')
+@register_model(
+    ModelType.codeqwen1half_7b_chat,
+    'qwen/CodeQwen1.5-7B-Chat',
+    LoRATM.qwen1half,
+    TemplateType.qwen,
+    support_flash_attn=True,
+    support_vllm=True,
+    requires=['transformers>=4.37'])
 def get_model_tokenizer_qwen1half(model_dir: str,
                                   torch_dtype: Dtype,
                                   model_kwargs: Dict[str, Any],

From 91f28f4373c0c236d70e944ece1baf3a51efcf0e Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Apr 2024 02:37:27 +0800
Subject: [PATCH 16/26] support awq gptq

---
 swift/llm/infer.py          |   4 +-
 swift/llm/utils/argument.py |  24 +++---
 swift/llm/utils/model.py    | 164 ++++++++++++++++++++++--------------
 swift/llm/utils/template.py |   6 ++
 4 files changed, 121 insertions(+), 77 deletions(-)

diff --git a/swift/llm/infer.py b/swift/llm/infer.py
index e34177a2ad..24f998b896 100644
--- a/swift/llm/infer.py
+++ b/swift/llm/infer.py
@@ -84,8 +84,8 @@ def merge_lora(args: InferArguments,
     logger.info(f'replace_if_exists: {replace_if_exists}')
     assert args.ckpt_dir is not None, 'args.ckpt_dir is not specified.'
     assert args.sft_type == 'lora', "Only supports sft_type == 'lora'"
-    assert 'int4' not in args.model_type, 'int4 model is not supported'
-    assert 'int8' not in args.model_type, 'int8 model is not supported'
+    for s in ['int4', 'int8', 'awq']:
+        assert s not in args.model_type, f'{s} model is not supported'
     if args.quantization_bit != 0:
         logger.warning('It is not recommended to merge quantized models, '
                        'as this can result in performance degradation')
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index 72b0d57b2e..7e4bb2380c 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -118,18 +118,18 @@ def select_dtype(
             else:
                 raise ValueError(f'args.dtype: {self.dtype}')
         # cuda, npu
-        if self.dtype == 'AUTO' and not is_torch_bf16_gpu_available():
-            self.dtype = 'fp16'
-        if self.dtype == 'AUTO' and ('int4' in self.model_type
-                                     or 'int8' in self.model_type):
-            model_torch_dtype = MODEL_MAPPING[self.model_type]['torch_dtype']
-            if model_torch_dtype is not None:
-                self.dtype = dtype_mapping[model_torch_dtype]
         if self.dtype == 'AUTO':
-            if isinstance(self, SftArguments):
-                self.dtype = 'bf16'
+            if is_torch_bf16_gpu_available():
+                self.dtype = 'fp16'
             else:
-                return None, False, False
+                model_torch_dtype = MODEL_MAPPING[self.model_type].get(
+                    'torch_dtype')
+                if model_torch_dtype is not None:
+                    self.dtype = dtype_mapping[model_torch_dtype]
+                elif isinstance(self, SftArguments):
+                    self.dtype = 'bf16'
+                else:
+                    return None, False, False
 
         torch_dtype = dtype_mapping_reversed[self.dtype]
 
@@ -661,8 +661,8 @@ def __post_init__(self) -> None:
             assert len(self.additional_trainable_parameters) == 0, (
                 'lora does not support `additional_trainable_parameters`, please set `--sft_type full`'
             )
-            if 'int4' in self.model_type or 'int8' in self.model_type:
-                assert self.quantization_bit == 0, 'int4 and int8 models do not need to be quantized again.'
+            if 'int4' in self.model_type or 'int8' in self.model_type or 'awq' in self.model_type:
+                assert self.quantization_bit == 0, 'int4, int8 or awq models do not need to be quantized again.'
             if self.learning_rate is None:
                 self.learning_rate = 1e-4
             if self.save_only_model is None:
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 90a24dc54b..a0c4b87385 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -216,6 +216,9 @@ class ModelType:
     mixtral_moe_7b_instruct = 'mixtral-moe-7b-instruct'
     mixtral_moe_7b_aqlm_2bit_1x16 = 'mixtral-moe-7b-aqlm-2bit-1x16'  # aqlm
     mixtral_moe_8x22b_v1 = 'mixtral-moe-8x22b-v1'
+    # wizardlm
+    wizardlm2_7b_awq = 'wizardlm2-7b-awq'
+    wizardlm2_8x22b = 'wizardlm2-8x22b'
     # baichuan
     baichuan_7b = 'baichuan-7b'
     baichuan_13b = 'baichuan-13b'
@@ -413,6 +416,44 @@ def _register_model(
     return _register_model
 
 
+def _check_awq_ext() -> None:
+    try:
+        from awq.utils.packing_utils import dequantize_gemm
+        import awq_ext  # with CUDA kernels (AutoAWQ_kernels)
+    except ImportError as e:
+        raise ImportError(
+            'You are training awq models, remember installing awq_ext by '
+            '`git clone https://github.com/casper-hansen/AutoAWQ_kernels '
+            '&& cd AutoAWQ_kernels && pip install -e .`') from e
+
+
+def _check_gptq_model(bits: int, model_kwargs: Dict[str, Any]) -> None:
+    assert model_kwargs.get('quantization_config') is None
+    if version.parse(transformers.__version__) >= version.parse('4.35'):
+        model_kwargs['quantization_config'] = GPTQConfig(
+            bits=bits, use_exllama=False)
+    else:
+        model_kwargs['quantization_config'] = GPTQConfig(
+            bits=bits, disable_exllama=True)
+
+    # fix quantlinear bug
+    from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear
+    __old_forward = QuantLinear.forward
+
+    def _new_forward(self, x):
+        if not self.training or not self.autogptq_cuda_available:
+            return self.__old_forward(x)
+        # fix sft no grad
+        self.autogptq_cuda_available = False
+        res = self.__old_forward(x)
+        self.autogptq_cuda_available = True
+        return res
+
+    if not hasattr(QuantLinear, '__old_forward'):  # avoid double patching
+        QuantLinear.__old_forward = __old_forward
+        QuantLinear.forward = _new_forward
+
+
 @register_model(
     ModelType.internlm_20b,
     'Shanghai_AI_Laboratory/internlm-20b',
@@ -564,6 +605,13 @@ def get_model_tokenizer_from_repo(model_dir: str,
                                   automodel_class=AutoModelForCausalLM,
                                   **kwargs):
     """load from an independent repository"""
+    is_awq = kwargs.pop('is_awq', False)
+    gptq_bits = kwargs.pop('gptq_bits', 0)
+    is_training = kwargs.pop('is_training', False)
+    if is_awq and is_training:
+        _check_awq_ext()
+    if gptq_bits > 0 and is_training:
+        _check_gptq_model(gptq_bits, model_kwargs)
     if model_config is None:
         model_config = AutoConfig.from_pretrained(
             model_dir, trust_remote_code=True)
@@ -585,6 +633,10 @@ def get_model_tokenizer_from_repo(model_dir: str,
                 torch_dtype=torch_dtype,
                 trust_remote_code=True,
                 **model_kwargs)
+    if is_awq:
+        model.is_awq = is_awq
+    if gptq_bits > 0:
+        model.gptq_bits = gptq_bits
     return model, tokenizer
 
 
@@ -1026,6 +1078,17 @@ def cross_entropy_forward(self, inputs: Tensor,
     return model, tokenizer
 
 
+@register_model(
+    ModelType.wizardlm2_7b_awq,
+    'AI-ModelScope/WizardLM-2-7B-AWQ',
+    LoRATM.llama2,
+    TemplateType.wizardlm2_awq,
+    requires=['transformers>=4.34'],
+    torch_dtype=torch.float16,
+    support_flash_attn=True,
+    support_vllm=True,
+    function_kwargs={'is_awq': True},
+    hf_model_id='MaziyarPanahi/WizardLM-2-7B-AWQ')
 @register_model(
     ModelType.gemma_2b,
     'AI-ModelScope/gemma-2b',
@@ -1163,7 +1226,8 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.default_generation,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/CodeQwen1.5-7B')
 @register_model(
     ModelType.qwen1half_moe_a2_7b,
     'qwen/Qwen1.5-MoE-A2.7B',
@@ -1577,6 +1641,7 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_vllm=True,
     function_kwargs={'is_awq': True},
     requires=['transformers>=4.37', 'autoawq'],
+    torch_dtype=torch.float16,
     hf_model_id='Qwen/Qwen1.5-0.5B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_1_8b_chat_awq,
@@ -1587,6 +1652,7 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_vllm=True,
     function_kwargs={'is_awq': True},
     requires=['transformers>=4.37', 'autoawq'],
+    torch_dtype=torch.float16,
     hf_model_id='Qwen/Qwen1.5-1.8B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_4b_chat_awq,
@@ -1597,6 +1663,7 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_vllm=True,
     function_kwargs={'is_awq': True},
     requires=['transformers>=4.37', 'autoawq'],
+    torch_dtype=torch.float16,
     hf_model_id='Qwen/Qwen1.5-4B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_7b_chat_awq,
@@ -1607,6 +1674,7 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_vllm=True,
     function_kwargs={'is_awq': True},
     requires=['transformers>=4.37', 'autoawq'],
+    torch_dtype=torch.float16,
     hf_model_id='Qwen/Qwen1.5-7B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_14b_chat_awq,
@@ -1617,6 +1685,7 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_vllm=True,
     function_kwargs={'is_awq': True},
     requires=['transformers>=4.37', 'autoawq'],
+    torch_dtype=torch.float16,
     hf_model_id='Qwen/Qwen1.5-14B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_32b_chat_awq,
@@ -1627,6 +1696,7 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_vllm=True,
     function_kwargs={'is_awq': True},
     requires=['transformers>=4.37', 'autoawq'],
+    torch_dtype=torch.float16,
     hf_model_id='Qwen/Qwen1.5-32B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_72b_chat_awq,
@@ -1637,6 +1707,7 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_vllm=True,
     function_kwargs={'is_awq': True},
     requires=['transformers>=4.37', 'autoawq'],
+    torch_dtype=torch.float16,
     hf_model_id='Qwen/Qwen1.5-72B-Chat-AWQ')
 @register_model(
     ModelType.codeqwen1half_7b_chat_awq,
@@ -1646,7 +1717,9 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_flash_attn=True,
     support_vllm=True,
     function_kwargs={'is_awq': True},
-    requires=['transformers>=4.37', 'autoawq'])
+    requires=['transformers>=4.37', 'autoawq'],
+    torch_dtype=torch.float16,
+    hf_model_id='Qwen/CodeQwen1.5-7B-Chat-AWQ')
 @register_model(
     ModelType.qwen1half_0_5b_chat,
     'qwen/Qwen1.5-0.5B-Chat',
@@ -1726,24 +1799,14 @@ def get_model_tokenizer_aqlm(model_dir: str,
     TemplateType.qwen,
     support_flash_attn=True,
     support_vllm=True,
-    requires=['transformers>=4.37'])
+    requires=['transformers>=4.37'],
+    hf_model_id='Qwen/CodeQwen1.5-7B-Chat')
 def get_model_tokenizer_qwen1half(model_dir: str,
                                   torch_dtype: Dtype,
                                   model_kwargs: Dict[str, Any],
                                   load_model: bool = True,
                                   **kwargs):
-    is_awq = kwargs.pop('is_awq', False)
-    is_training = kwargs.pop('is_training', False)
     kwargs['eos_token'] = '<|im_end|>'
-    if is_awq and is_training:
-        try:
-            from awq.utils.packing_utils import dequantize_gemm
-            import awq_ext  # with CUDA kernels (AutoAWQ_kernels)
-        except ImportError as e:
-            raise ImportError(
-                'You are training awq models, remember installing awq_ext by '
-                '`git clone https://github.com/casper-hansen/AutoAWQ_kernels '
-                '&& cd AutoAWQ_kernels && pip install -e .`') from e
     return get_model_tokenizer_with_flash_attn(model_dir, torch_dtype,
                                                model_kwargs, load_model,
                                                **kwargs)
@@ -1756,7 +1819,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4')
@@ -1767,7 +1830,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 8},
+    function_kwargs={'gptq_bits': 8},
     support_flash_attn=True,
     hf_model_id='Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8')
 @register_model(
@@ -1777,7 +1840,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4')
@@ -1788,7 +1851,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 8},
+    function_kwargs={'gptq_bits': 8},
     support_flash_attn=True,
     hf_model_id='Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8')
 @register_model(
@@ -1798,7 +1861,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='Qwen/Qwen1.5-4B-Chat-GPTQ-Int4')
@@ -1809,7 +1872,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 8},
+    function_kwargs={'gptq_bits': 8},
     support_flash_attn=True,
     hf_model_id='Qwen/Qwen1.5-4B-Chat-GPTQ-Int8')
 @register_model(
@@ -1819,7 +1882,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='Qwen/Qwen1.5-7B-Chat-GPTQ-Int4')
@@ -1830,7 +1893,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 8},
+    function_kwargs={'gptq_bits': 8},
     support_flash_attn=True,
     hf_model_id='Qwen/Qwen1.5-7B-Chat-GPTQ-Int8')
 @register_model(
@@ -1840,7 +1903,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='Qwen/Qwen1.5-14B-Chat-GPTQ-Int4')
@@ -1851,7 +1914,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 8},
+    function_kwargs={'gptq_bits': 8},
     support_flash_attn=True,
     hf_model_id='Qwen/Qwen1.5-14B-Chat-GPTQ-Int8')
 @register_model(
@@ -1861,7 +1924,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='Qwen/Qwen1.5-32B-Chat-GPTQ-Int4')
@@ -1872,7 +1935,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='Qwen/Qwen1.5-72B-Chat-GPTQ-Int4')
@@ -1883,7 +1946,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 8},
+    function_kwargs={'gptq_bits': 8},
     support_flash_attn=True,
     hf_model_id='Qwen/Qwen1.5-72B-Chat-GPTQ-Int8')
 @register_model(
@@ -1893,7 +1956,7 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5', 'transformers>=4.37'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     hf_model_id='Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4')
 def get_model_tokenizer_qwen1half_intx(model_dir: str,
@@ -2648,7 +2711,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 8},
+    function_kwargs={'gptq_bits': 8},
     support_flash_attn=True,
     hf_model_id='Qwen/Qwen-1_8B-Chat-Int8')
 @register_model(
@@ -2658,7 +2721,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='Qwen/Qwen-1_8B-Chat-Int4')
@@ -2669,7 +2732,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 8},
+    function_kwargs={'gptq_bits': 8},
     support_flash_attn=True,
     hf_model_id='Qwen/Qwen-72B-Chat-Int8')
 @register_model(
@@ -2679,7 +2742,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='Qwen/Qwen-72B-Chat-Int4')
@@ -2690,7 +2753,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     support_vllm=True,
     tags=['financial'],
@@ -2704,7 +2767,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={
         'get_qwen_function': get_model_tokenizer_qwen_vl,
-        'bits': 4
+        'gptq_bits': 4
     },
     support_flash_attn=True,
     tags=['multi-modal', 'vision'],
@@ -2716,7 +2779,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 8},
+    function_kwargs={'gptq_bits': 8},
     support_flash_attn=True,
     hf_model_id='Qwen/Qwen-14B-Chat-Int8')
 @register_model(
@@ -2726,7 +2789,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 8},
+    function_kwargs={'gptq_bits': 8},
     support_flash_attn=True,
     hf_model_id='Qwen/Qwen-7B-Chat-Int8')
 @register_model(
@@ -2736,7 +2799,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='Qwen/Qwen-14B-Chat-Int4')
@@ -2747,7 +2810,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str,
     TemplateType.qwen,
     requires=['auto_gptq>=0.5'],
     torch_dtype=torch.float16,
-    function_kwargs={'bits': 4},
+    function_kwargs={'gptq_bits': 4},
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='Qwen/Qwen-7B-Chat-Int4')
@@ -2756,31 +2819,6 @@ def get_model_tokenizer_qwen_intx(model_dir: str,
                                   model_kwargs: Dict[str, Any],
                                   load_model: bool = True,
                                   **kwargs):
-    logger.info('use gptq, ignore bnb arguments')
-    bits = kwargs.pop('bits')
-    if version.parse(transformers.__version__) >= version.parse('4.35'):
-        model_kwargs['quantization_config'] = GPTQConfig(
-            bits=bits, use_exllama=False)
-    else:
-        model_kwargs['quantization_config'] = GPTQConfig(
-            bits=bits, disable_exllama=True)
-
-    # fix quantlinear bug
-    from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear
-    __old_forward = QuantLinear.forward
-
-    def _new_forward(self, x):
-        if not self.training or not self.autogptq_cuda_available:
-            return self.__old_forward(x)
-        # fix sft no grad
-        self.autogptq_cuda_available = False
-        res = self.__old_forward(x)
-        self.autogptq_cuda_available = True
-        return res
-
-    if not hasattr(QuantLinear, '__old_forward'):  # avoid double patching
-        QuantLinear.__old_forward = __old_forward
-        QuantLinear.forward = _new_forward
     get_qwen_function = kwargs.pop('get_qwen_function',
                                    get_model_tokenizer_qwen_chat)
     model, tokenizer = get_qwen_function(model_dir, torch_dtype, model_kwargs,
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 1c7a608636..8d03bbc3f8 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -61,6 +61,7 @@ class TemplateType:
     minicpm_v = 'minicpm-v'
     gemma = 'gemma'
     mplug_owl2 = 'mplug-owl2'
+    wizardlm2_awq = 'wizardlm2-awq'
     # compatibility. (Deprecated)
     chatml = 'chatml'
     telechat = 'telechat'
@@ -1363,6 +1364,11 @@ def data_collator(self,
     use_model=True,
     lazy_tokenize=True)
 
+register_template(
+    TemplateType.wizardlm2_awq,
+    Template(['{{SYSTEM}}'], ['User:\n{{QUERY}}\n\nAssistant:\n'], ['\n\n'],
+             ['</s>']))
+
 
 def get_template(
     template_type: str,

From e451533f7dd153bbe02a3af04fd16d323898755b Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Apr 2024 03:22:33 +0800
Subject: [PATCH 17/26] support yi-awq, yi-gptq

---
 swift/llm/utils/model.py | 58 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index a0c4b87385..adb6be39c1 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -131,11 +131,15 @@ class ModelType:
     yi_6b = 'yi-6b'
     yi_6b_200k = 'yi-6b-200k'
     yi_6b_chat = 'yi-6b-chat'
+    yi_6b_chat_awq = 'yi-6b-chat-awq'
+    yi_6b_chat_int8 = 'yi-6b-chat-int8'
     yi_9b = 'yi-9b'
     yi_9b_200k = 'yi-9b-200k'
     yi_34b = 'yi-34b'
     yi_34b_200k = 'yi-34b-200k'
     yi_34b_chat = 'yi-34b-chat'
+    yi_34b_chat_awq = 'yi-34b-chat-awq'
+    yi_34b_chat_int8 = 'yi-34b-chat-int8'
     # yi-vl
     yi_vl_6b_chat = 'yi-vl-6b-chat'
     yi_vl_34b_chat = 'yi-vl-34b-chat'
@@ -633,9 +637,9 @@ def get_model_tokenizer_from_repo(model_dir: str,
                 torch_dtype=torch_dtype,
                 trust_remote_code=True,
                 **model_kwargs)
-    if is_awq:
+    if load_model and is_awq:
         model.is_awq = is_awq
-    if gptq_bits > 0:
+    if load_model and gptq_bits > 0:
         model.gptq_bits = gptq_bits
     return model, tokenizer
 
@@ -1369,6 +1373,30 @@ def cross_entropy_forward(self, inputs: Tensor,
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='01-ai/Yi-6B-Chat')
+@register_model(
+    ModelType.yi_6b_chat_awq,
+    '01ai/Yi-6B-Chat-4bits',
+    LoRATM.llama2,
+    TemplateType.yi,
+    eos_token='<|im_end|>',
+    requires=['autoawq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'is_awq': True},
+    support_flash_attn=True,
+    support_vllm=True,
+    hf_model_id='01-ai/Yi-6B-Chat-4bits')
+@register_model(
+    ModelType.yi_6b_chat_int8,
+    '01ai/Yi-6B-Chat-8bits',
+    LoRATM.llama2,
+    TemplateType.yi,
+    eos_token='<|im_end|>',
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 8},
+    support_flash_attn=True,
+    support_vllm=True,
+    hf_model_id='01-ai/Yi-6B-Chat-8bits')
 @register_model(
     ModelType.yi_34b_chat,
     '01ai/Yi-34B-Chat',
@@ -1378,6 +1406,30 @@ def cross_entropy_forward(self, inputs: Tensor,
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='01-ai/Yi-34B-Chat')
+@register_model(
+    ModelType.yi_34b_chat_awq,
+    '01ai/Yi-34B-Chat-4bits',
+    LoRATM.llama2,
+    TemplateType.yi,
+    eos_token='<|im_end|>',
+    requires=['autoawq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'is_awq': True},
+    support_flash_attn=True,
+    support_vllm=True,
+    hf_model_id='01-ai/Yi-34B-Chat-4bits')
+@register_model(
+    ModelType.yi_34b_chat_int8,
+    '01ai/Yi-34B-Chat-8bits',
+    LoRATM.llama2,
+    TemplateType.yi,
+    eos_token='<|im_end|>',
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 8},
+    support_flash_attn=True,
+    support_vllm=True,
+    hf_model_id='01-ai/Yi-34B-Chat-8bits')
 @register_model(
     ModelType.yi_34b_200k,
     '01ai/Yi-34B-200K',
@@ -2925,7 +2977,7 @@ def get_model_tokenizer_telechat(model_dir: str,
                                  **kwargs):
     if torch_dtype == torch.bfloat16:
         logger.info(
-            'telechat-7b does not support the bfl16 dtype; the dtype is converted to fp16.'
+            'telechat-7b does not support the bf16 dtype; the dtype is converted to fp16.'
         )
         torch_dtype = torch.float16
     model_config = AutoConfig.from_pretrained(

From 7258e11b317eb30d40c82f4896c16ffb7d797243 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Apr 2024 10:59:32 +0800
Subject: [PATCH 18/26] support wizardlm2_8x22b zero3-offload docs

---
 ...37\346\200\201\345\205\274\345\256\271.md" | 27 ++++++++
 swift/llm/ds_config/zero3_offload.json        | 62 +++++++++++++++++++
 swift/llm/utils/argument.py                   | 18 +++---
 swift/llm/utils/model.py                      |  9 +++
 swift/llm/utils/template.py                   | 10 +++
 5 files changed, 119 insertions(+), 7 deletions(-)
 create mode 100644 "docs/source/LLM/HuggingFace\347\224\237\346\200\201\345\205\274\345\256\271.md"
 create mode 100644 swift/llm/ds_config/zero3_offload.json

diff --git "a/docs/source/LLM/HuggingFace\347\224\237\346\200\201\345\205\274\345\256\271.md" "b/docs/source/LLM/HuggingFace\347\224\237\346\200\201\345\205\274\345\256\271.md"
new file mode 100644
index 0000000000..6c184361b0
--- /dev/null
+++ "b/docs/source/LLM/HuggingFace\347\224\237\346\200\201\345\205\274\345\256\271.md"
@@ -0,0 +1,27 @@
+# HuggingFace生态兼容
+默认我们会使用[ModelScope](https://modelscope.cn/my/overview)中的模型和数据集进行微调和推理。但是考虑到海外用户更熟悉[HuggingFace](https://huggingface.co/)生态，这里对其进行兼容。
+
+你需要设置环境变量`USE_HF=1`，支持的HuggingFace模型和数据集可以参考[支持的模型和数据集](支持的模型和数据集.md)，部分数据集只支持在ModelScope环境下使用。
+
+以下是对`qwen1.5-7b-chat`的推理脚本:
+```shell
+# Experimental Environment: A10, 3090, V100
+USE_HF=1 CUDA_VISIBLE_DEVICES=0 swift infer --model_type qwen1half-7b-chat
+```
+
+微调脚本:
+```shell
+# Experimental Environment: 2 * A100
+# GPU Memory Requirement: 2 * 30GB
+USE_HF=1 \
+NPROC_PER_NODE=2 \
+CUDA_VISIBLE_DEVICES=0,1 \
+swift sft \
+    --model_type qwen1half-7b-chat \
+    --dataset blossom-math-zh \
+    --num_train_epochs 5 \
+    --sft_type lora \
+    --output_dir output \
+```
+
+微调后推理与部署等内容参考其他文档.
diff --git a/swift/llm/ds_config/zero3_offload.json b/swift/llm/ds_config/zero3_offload.json
new file mode 100644
index 0000000000..8374e5b0d3
--- /dev/null
+++ b/swift/llm/ds_config/zero3_offload.json
@@ -0,0 +1,62 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+    "bf16": {
+        "enabled": "auto"
+    },
+
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupDecayLR",
+        "params": {
+            "total_num_steps": "auto",
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "none",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index 7e4bb2380c..208f015619 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -400,7 +400,7 @@ class SftArguments(ArgumentsBase):
     neftune_backend: Literal['swift', 'transformers'] = None
 
     gradient_checkpointing: Optional[bool] = None
-    # e.g. 'default-zero3', 'default-zero2', 'ds_config/zero2.json'
+    # e.g. 'default-zero3', 'default-zero2', 'ds_config/zero2.json', 'zero3-offload'
     deepspeed: Optional[str] = None
     batch_size: int = 1
     eval_batch_size: Optional[int] = None
@@ -587,12 +587,16 @@ def __post_init__(self) -> None:
         if is_pai_training_job():
             self._handle_pai_compat()
         ds_config_folder = os.path.join(__file__, '..', '..', 'ds_config')
-        if self.deepspeed == 'default-zero2':
-            self.deepspeed = os.path.abspath(
-                os.path.join(ds_config_folder, 'zero2.json'))
-        elif self.deepspeed == 'default-zero3':
-            self.deepspeed = os.path.abspath(
-                os.path.join(ds_config_folder, 'zero3.json'))
+        deepspeed_mapping = {
+            'default-zero2': 'zero2.json',
+            'default-zero3': 'zero3.json',
+            'zero3-offload': 'zero3-offload.json'
+        }
+        for ds_name, ds_config in deepspeed_mapping.items():
+            if self.deepspeed == ds_name:
+                self.deepspeed = os.path.abspath(
+                    os.path.join(ds_config_folder, ds_config))
+                break
 
         self.handle_path()
         self.set_model_type()
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index adb6be39c1..2d523d9640 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -1082,6 +1082,15 @@ def cross_entropy_forward(self, inputs: Tensor,
     return model, tokenizer
 
 
+@register_model(
+    ModelType.wizardlm2_8x22b,
+    'AI-ModelScope/WizardLM-2-8x22B',
+    LoRATM.llama2,
+    TemplateType.wizardlm2,
+    requires=['transformers>=4.36'],
+    support_flash_attn=True,
+    support_vllm=True,
+    hf_model_id='alpindale/WizardLM-2-8x22B')
 @register_model(
     ModelType.wizardlm2_7b_awq,
     'AI-ModelScope/WizardLM-2-7B-AWQ',
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 8d03bbc3f8..43503b74ec 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -62,6 +62,7 @@ class TemplateType:
     gemma = 'gemma'
     mplug_owl2 = 'mplug-owl2'
     wizardlm2_awq = 'wizardlm2-awq'
+    wizardlm2 = 'wizardlm2'
     # compatibility. (Deprecated)
     chatml = 'chatml'
     telechat = 'telechat'
@@ -1369,6 +1370,15 @@ def data_collator(self,
     Template(['{{SYSTEM}}'], ['User:\n{{QUERY}}\n\nAssistant:\n'], ['\n\n'],
              ['</s>']))
 
+_wizardlm2_system = (
+    'A chat between a curious user and an artificial intelligence assistant. '
+    'The assistant gives helpful, detailed, and polite answers to the user\'s questions. '
+)
+register_template(
+    TemplateType.wizardlm2,
+    Template(['{{SYSTEM}}'], ['USER: {{QUERY}} ASSISTANT:'], ['</s>'],
+             ['</s>'], _wizardlm2_system))
+
 
 def get_template(
     template_type: str,

From 2c1b466e2da6475fdeb0d827bc8e008107dc0a65 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Apr 2024 11:41:46 +0800
Subject: [PATCH 19/26] update readme

---
 README.md                       |  4 ++--
 README_CN.md                    |  3 ++-
 docs/source/LLM/index.md        |  1 +
 docs/source_en/LLM/Compat-HF.md | 27 +++++++++++++++++++++++++++
 docs/source_en/LLM/index.md     |  1 +
 5 files changed, 33 insertions(+), 3 deletions(-)
 create mode 100644 docs/source_en/LLM/Compat-HF.md

diff --git a/README.md b/README.md
index baf34b96c2..75fd44b1b7 100644
--- a/README.md
+++ b/README.md
@@ -390,7 +390,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | XVerse                                         | [XVerse series models](https://github.com/xverse-ai)                    | Chinese<br>English    | 7B-65B                                 | base model<br>chat model<br>long text model<br>MoE model                |
 | LLaMA2                                         | [LLaMA2 series models](https://github.com/facebookresearch/llama)       | English            | 7B-70B<br>including quantized versions   | base model<br>chat model                       |
 | Mistral<br>Mixtral                            | [Mistral series models](https://github.com/mistralai/mistral-src)       | English            | 7B-22B     | base model<br>instruct model<br>MoE model                     |
-| YI                                             | [01AI's YI series models](https://github.com/01-ai)                     | Chinese<br>English    | 6B-34B                                 | base model<br>chat model<br>long text model            |
+| YI                                             | [01AI's YI series models](https://github.com/01-ai)                     | Chinese<br>English    | 6B-34B<br>including quantized             | base model<br>chat model<br>long text model            |
 | InternLM<br>InternLM2<br>InternLM2-Math              | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese<br>English | 1.8B-20B                            | base model<br>chat model<br>math model            |
 | DeepSeek<br>DeepSeek-MoE<br>DeepSeek-Coder<br>DeepSeek-Math          | [DeepSeek series models](https://github.com/deepseek-ai)       | Chinese<br>English    | 1.3B-67B                               | base model<br>chat model<br>MoE model<br>code model<br>math model |
 | MAMBA                                          | [MAMBA temporal convolution model](https://github.com/state-spaces/mamba) | English          | 130M-2.8B                              | base model                                 |
@@ -413,7 +413,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model<br>chat model  |
 | mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese<br>English | 13B | base model  |
 | c4ai-command-r | [c4ai](https://cohere.com/command) | Multilingual | 35B-104B | chat model  |
-
+| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B | chat model<br>MoE model |
 
 #### MLLMs
 
diff --git a/README_CN.md b/README_CN.md
index 83ce16dc5b..b4ffd93d55 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -388,7 +388,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | XVerse                                              | [元象系列模型](https://github.com/xverse-ai)                 | 中文<br>英文 | 7B-65B                    | base模型<br>chat模型<br>长文本模型<br>MoE模型             |                |
 | LLaMA2                                              | [LLaMA2系列模型](https://github.com/facebookresearch/llama)  | 英文       | 7B-70B<br>包含量化版本      | base模型<br>chat模型                          |
 | Mistral<br>Mixtral                                 | [Mistral系列模型](https://github.com/mistralai/mistral-src)  | 英文       | 7B-8x22B | base模型<br>instruct模型<br>MoE模型             |
-| YI                                                  | [01AI的YI系列模型](https://github.com/01-ai)                 | 中文<br>英文 | 6B-34B                    | base模型<br>chat模型<br>长文本模型                 |
+| YI                                                  | [01AI的YI系列模型](https://github.com/01-ai)                 | 中文<br>英文 | 6B-34B<br>包含量化版本          | base模型<br>chat模型<br>长文本模型                 |
 | InternLM<br>InternLM2<br>InternLM2-Math                   | [浦江实验室书生浦语系列模型](https://github.com/InternLM/InternLM) | 中文<br>英文 | 1.8B-20B                  | base模型<br>chat模型<br>数学模型                  |
 | DeepSeek<br>DeepSeek-MoE<br>DeepSeek-Coder<br>DeepSeek-Math               | [幻方系列模型](https://github.com/deepseek-ai)               | 中文<br>英文 | 1.3B-67B                  | base模型<br>chat模型<br>MoE模型<br>代码模型<br>数学模型 |
 | MAMBA                                               | [MAMBA时序卷积模型](https://github.com/state-spaces/mamba)   | 英文       | 130M-2.8B                 | base模型                                    |
@@ -411,6 +411,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | dbrx | [databricks](https://github.com/databricks/dbrx) | 英文 | 132B | base模型<br>chat模型  |
 | mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | 中文<br>英文 | 13B | base模型  |
 | c4ai-command-r | [c4ai](https://cohere.com/command) | 多语种 | 35B-104B | chat模型  |
+| WizardLM2 | [WizardLM2系列模型](https://github.com/nlpxucan/WizardLM) | 多语种 | 7B-8x22B | chat模型<br>MoE模型 |
 
 
 #### 多模态大模型
diff --git a/docs/source/LLM/index.md b/docs/source/LLM/index.md
index e87d53d817..8841854d8a 100644
--- a/docs/source/LLM/index.md
+++ b/docs/source/LLM/index.md
@@ -25,3 +25,4 @@
 2. [微调推理的命令行参数](命令行参数.md)
 3. [支持的模型和数据集列表](支持的模型和数据集.md)
 4. [运行速度与显存的Benchmark](Benchmark.md)
+5. [HuggingFace生态兼容](HuggingFace生态兼容.md)
diff --git a/docs/source_en/LLM/Compat-HF.md b/docs/source_en/LLM/Compat-HF.md
new file mode 100644
index 0000000000..bbc8da4534
--- /dev/null
+++ b/docs/source_en/LLM/Compat-HF.md
@@ -0,0 +1,27 @@
+# HuggingFace Eco-compatibility
+By default, we use models and datasets from [ModelScope](https://modelscope.cn/my/overview) for fine-tuning and inference. However, considering that overseas users are more familiar with the [HuggingFace](https://huggingface.co/) ecosystem, we have made it compatible with HuggingFace.
+
+To enable HuggingFace compatibility, you need to set the environment variable `USE_HF=1`. Supported HuggingFace models and datasets can be found in the [Supported Models and Datasets](Supported-models-datasets.md). Note that some datasets are only supported in the ModelScope environment.
+
+Here is an example inference script for qwen1.5-7b-chat:
+```shell
+# Experimental Environment: A10, 3090, V100
+USE_HF=1 CUDA_VISIBLE_DEVICES=0 swift infer --model_type qwen1half-7b-chat
+```
+
+微调脚本:
+```shell
+# Experimental Environment: 2 * A100
+# GPU Memory Requirement: 2 * 30GB
+USE_HF=1 \
+NPROC_PER_NODE=2 \
+CUDA_VISIBLE_DEVICES=0,1 \
+swift sft \
+    --model_type qwen1half-7b-chat \
+    --dataset blossom-math-zh \
+    --num_train_epochs 5 \
+    --sft_type lora \
+    --output_dir output \
+```
+
+Please refer to other documents for inference after fine-tuning, and deployment .
diff --git a/docs/source_en/LLM/index.md b/docs/source_en/LLM/index.md
index 0a0a79d869..057a29cebc 100644
--- a/docs/source_en/LLM/index.md
+++ b/docs/source_en/LLM/index.md
@@ -26,3 +26,4 @@ Please check: [Multi-Modal Best Practices](../Multi-Modal/index.md)
 2. [Command Line Parameters](Command-line-parameters.md)
 3. [Supported models and datasets](Supported-models-datasets.md)
 4. [Benchmark](Benchmark.md)
+5. [Compatible with the HuggingFace ecosystem](Compat-HF.md)

From 836066e0b416fdaaad42728f30386e6b25a70265 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Apr 2024 11:49:29 +0800
Subject: [PATCH 20/26] update readme

---
 README.md    | 2 +-
 README_CN.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 75fd44b1b7..cecfe652e6 100644
--- a/README.md
+++ b/README.md
@@ -413,7 +413,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model<br>chat model  |
 | mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese<br>English | 13B | base model  |
 | c4ai-command-r | [c4ai](https://cohere.com/command) | Multilingual | 35B-104B | chat model  |
-| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B | chat model<br>MoE model |
+| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B<br>including quantized versions | chat model<br>MoE model |
 
 #### MLLMs
 
diff --git a/README_CN.md b/README_CN.md
index b4ffd93d55..ef720c9267 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -411,7 +411,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | dbrx | [databricks](https://github.com/databricks/dbrx) | 英文 | 132B | base模型<br>chat模型  |
 | mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | 中文<br>英文 | 13B | base模型  |
 | c4ai-command-r | [c4ai](https://cohere.com/command) | 多语种 | 35B-104B | chat模型  |
-| WizardLM2 | [WizardLM2系列模型](https://github.com/nlpxucan/WizardLM) | 多语种 | 7B-8x22B | chat模型<br>MoE模型 |
+| WizardLM2 | [WizardLM2系列模型](https://github.com/nlpxucan/WizardLM) | 多语种 | 7B-8x22B<br>包含量化版本 | chat模型<br>MoE模型 |
 
 
 #### 多模态大模型

From 2972df92f18a309f63a2dce059f17887d158bad5 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Apr 2024 13:39:59 +0800
Subject: [PATCH 21/26] update

---
 swift/llm/utils/argument.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index 208f015619..11c0c7dd25 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -590,7 +590,7 @@ def __post_init__(self) -> None:
         deepspeed_mapping = {
             'default-zero2': 'zero2.json',
             'default-zero3': 'zero3.json',
-            'zero3-offload': 'zero3-offload.json'
+            'zero3-offload': 'zero3_offload.json'
         }
         for ds_name, ds_config in deepspeed_mapping.items():
             if self.deepspeed == ds_name:

From 1ad7e3613940b9afeec89b985ba1825e75eb0db0 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Apr 2024 13:44:42 +0800
Subject: [PATCH 22/26] update  readme

---
 README.md    | 17 +++++++++++++++++
 README_CN.md | 16 ++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/README.md b/README.md
index cecfe652e6..7e565f99e9 100644
--- a/README.md
+++ b/README.md
@@ -318,6 +318,23 @@ swift sft \
     --deepspeed default-zero3 \
 ```
 
+ZeRO3-Offload:
+```shell
+# Experimental Environment: 4 * A100
+# GPU Memory Requirement: 4 * 12GB
+# Runtime: 60 hours
+NPROC_PER_NODE=4 \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+swift sft \
+    --model_id_or_path AI-ModelScope/WizardLM-2-8x22B \
+    --dataset blossom-math-zh \
+    --num_train_epochs 5 \
+    --sft_type lora \
+    --output_dir output \
+    --deepspeed zero3-offload \
+```
+
+
 ### Inference
 Original model:
 ```shell
diff --git a/README_CN.md b/README_CN.md
index ef720c9267..1c0c7abb5c 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -316,6 +316,22 @@ swift sft \
     --deepspeed default-zero3 \
 ```
 
+ZeRO3-Offload:
+```shell
+# 实验环境: 4 * A100
+# 显存需求: 4 * 12GB
+# 运行时长: 60小时
+NPROC_PER_NODE=4 \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+swift sft \
+    --model_id_or_path AI-ModelScope/WizardLM-2-8x22B \
+    --dataset blossom-math-zh \
+    --num_train_epochs 5 \
+    --sft_type lora \
+    --output_dir output \
+    --deepspeed zero3-offload \
+```
+
 ### 推理
 原始模型:
 ```shell

From 15abebae3485692034170ef86fc393b298caf8b0 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Apr 2024 13:57:01 +0800
Subject: [PATCH 23/26] update readme

---
 README.md    | 2 ++
 README_CN.md | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 7e565f99e9..6eb9a768b1 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,8 @@ To facilitate use by users unfamiliar with deep learning, we provide a Gradio we
 Additionally, we are expanding capabilities for other modalities. Currently, we support full-parameter training and LoRA training for AnimateDiff.
 
 ## 🎉 News
+- 2024.04.18: Supported models: wizardlm2-7b-awq, wizardlm2-8x22b, yi-6b-chat-awq, yi-6b-chat-int8, yi-34b-chat-awq, yi-34b-chat-int8. Supported `--deepspeed zero3-offload` and provided default zero3-offload configuration file for zero3+cpu offload usage.
+- 2024.04.18: Supported compatibility with HuggingFace ecosystem using the environment variable `USE_HF`, switching to use models and datasets from HF. Please refer to the [HuggingFace ecosystem compatibility documentation](https://github.com/modelscope/swift/tree/main/docs/source_en/LLM/Compat-HF.md).
 - 2024.04.17: Support the evaluation for OpenAI standard interfaces. Check the [parameter documentation](docs/source_en/LLM/Command-line-parameters.md#eval-parameters) for details.
 - 🔥2024.04.17: Support **CodeQwen1.5-7B** series: CodeQwen1.5-7B, CodeQwen1.5-7B-Chat,CodeQwen1.5-7B-Chat-AWQ, use [this script](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/codeqwen1half_7b_chat/lora/sft.sh) to train.
 - 2024.04.16: Supports inference and fine-tuning of llava-v1.6-34b model. For best practice, you can refer to [here](https://github.com/modelscope/swift/tree/main/docs/source_en/Multi-Modal/llava-best-practice.md).
diff --git a/README_CN.md b/README_CN.md
index 1c0c7abb5c..3e1891b33b 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -40,6 +40,8 @@ SWIFT支持近**200种LLM和MLLM**（多模态大模型）的训练、推理、
 此外，我们也在拓展其他模态的能力，目前我们支持了AnimateDiff的全参数训练和LoRA训练。
 
 ## 🎉 新闻
+- 2024.04.18: 支持模型: wizardlm2-7b-awq, wizardlm2-8x22b, yi-6b-chat-awq, yi-6b-chat-int8, yi-34b-chat-awq, yi-34b-chat-int8. 支持`--deepspeed zero3-offload`, 提供了默认zero3-offload配置文件来使用zero3+cpu offload.
+- 2024.04.18: 支持使用环境变量`USE_HF`兼容HuggingFace生态, 切换成使用HF中的模型和数据集, 可以查看[HuggingFace生态兼容文档](https://github.com/modelscope/swift/tree/main/docs/source/LLM/HuggingFace生态兼容.md).
 - 2024.04.17: 支持OpenAI样式的接口评测, 可以查看[评测参数接口文档](docs/source/LLM/命令行参数.md#eval参数)来查看使用方法.
 - 🔥2024.04.17: 支持 **CodeQwen1.5-7B**系列: CodeQwen1.5-7B, CodeQwen1.5-7B-Chat,CodeQwen1.5-7B-Chat-AWQ, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/codeqwen1half_7b_chat/lora/sft.sh)来开始训练！
 - 2024.04.16: 支持llava-v1.6-34b的推理与微调, 最佳实践可以查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/Multi-Modal/llava最佳实践.md).

From bd3a4a25bf8f885b6c7ac239a5c7ab33b295c89c Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Apr 2024 14:20:32 +0800
Subject: [PATCH 24/26] update readme

---
 docs/source_en/LLM/Compat-HF.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source_en/LLM/Compat-HF.md b/docs/source_en/LLM/Compat-HF.md
index bbc8da4534..14c2a8a073 100644
--- a/docs/source_en/LLM/Compat-HF.md
+++ b/docs/source_en/LLM/Compat-HF.md
@@ -9,7 +9,7 @@ Here is an example inference script for qwen1.5-7b-chat:
 USE_HF=1 CUDA_VISIBLE_DEVICES=0 swift infer --model_type qwen1half-7b-chat
 ```
 
-微调脚本:
+Fine-tuning script:
 ```shell
 # Experimental Environment: 2 * A100
 # GPU Memory Requirement: 2 * 30GB

From 418f8d696be1da441bc61a04890d9b1f33486984 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Apr 2024 15:06:30 +0800
Subject: [PATCH 25/26] better aqlm

---
 swift/llm/utils/model.py | 68 ++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 37 deletions(-)

diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 2d523d9640..a37d2307da 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -610,12 +610,20 @@ def get_model_tokenizer_from_repo(model_dir: str,
                                   **kwargs):
     """load from an independent repository"""
     is_awq = kwargs.pop('is_awq', False)
+    is_aqlm = kwargs.pop('is_aqlm', False)
     gptq_bits = kwargs.pop('gptq_bits', 0)
     is_training = kwargs.pop('is_training', False)
     if is_awq and is_training:
         _check_awq_ext()
     if gptq_bits > 0 and is_training:
         _check_gptq_model(gptq_bits, model_kwargs)
+    context = kwargs.get('context', None)
+    if is_aqlm and is_training:
+        require_version('transformers>=4.39')
+        import aqlm
+        context = aqlm.optimize_for_training()
+    if context is None:
+        context = nullcontext()
     if model_config is None:
         model_config = AutoConfig.from_pretrained(
             model_dir, trust_remote_code=True)
@@ -628,7 +636,6 @@ def get_model_tokenizer_from_repo(model_dir: str,
     if eos_token is not None:
         tokenizer.eos_token = eos_token
     model = None
-    context = kwargs.get('context', nullcontext())
     if load_model:
         with context:
             model = automodel_class.from_pretrained(
@@ -1657,42 +1664,6 @@ def get_model_tokenizer_with_flash_attn(model_dir: str,
         **kwargs)
 
 
-@register_model(
-    ModelType.llama2_7b_aqlm_2bit_1x16,
-    'AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf',
-    LoRATM.llama2,
-    TemplateType.default_generation_bos,
-    ignore_file_pattern=[r'.+\.bin$'],
-    support_flash_attn=True,
-    requires=['transformers>=4.38', 'aqlm', 'torch>=2.2.0'],
-    support_vllm=False,
-    hf_model_id='ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf')
-@register_model(
-    ModelType.mixtral_moe_7b_aqlm_2bit_1x16,
-    'AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf',
-    LoRATM.llama2,
-    TemplateType.default_generation_bos,
-    requires=['transformers>=4.38', 'aqlm', 'torch>=2.2.0'],
-    support_flash_attn=True,
-    support_vllm=False,
-    support_gradient_checkpointing=False,
-    hf_model_id='ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf')
-def get_model_tokenizer_aqlm(model_dir: str,
-                             torch_dtype: Dtype,
-                             model_kwargs: Dict[str, Any],
-                             load_model: bool = True,
-                             **kwargs):
-    import aqlm
-    context = aqlm.optimize_for_training()
-    return get_model_tokenizer_llama2(
-        model_dir,
-        torch_dtype,
-        model_kwargs,
-        load_model,
-        context=context,
-        **kwargs)
-
-
 @register_model(
     ModelType.qwen1half_0_5b_chat_awq,
     'qwen/Qwen1.5-0.5B-Chat-AWQ',
@@ -2358,6 +2329,29 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
     return model, tokenizer
 
 
+
+@register_model(
+    ModelType.llama2_7b_aqlm_2bit_1x16,
+    'AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf',
+    LoRATM.llama2,
+    TemplateType.default_generation_bos,
+    ignore_file_pattern=[r'.+\.bin$'],
+    support_flash_attn=True,
+    requires=['transformers>=4.38', 'aqlm', 'torch>=2.2.0'],
+    support_vllm=False,
+    function_kwargs={'is_aqlm': True},
+    hf_model_id='ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf')
+@register_model(
+    ModelType.mixtral_moe_7b_aqlm_2bit_1x16,
+    'AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf',
+    LoRATM.llama2,
+    TemplateType.default_generation_bos,
+    requires=['transformers>=4.38', 'aqlm', 'torch>=2.2.0'],
+    support_flash_attn=True,
+    support_vllm=False,
+    support_gradient_checkpointing=False,
+    function_kwargs={'is_aqlm': True},
+    hf_model_id='ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf')
 @register_model(
     ModelType.llama2_7b,
     'modelscope/Llama-2-7b-ms',

From 261d5c024995e5ddb700bcb04583465fa8498315 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt" <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 18 Apr 2024 15:08:55 +0800
Subject: [PATCH 26/26] lint pass

---
 swift/llm/utils/model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index a37d2307da..bd6158b526 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -2329,7 +2329,6 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
     return model, tokenizer
 
 
-
 @register_model(
     ModelType.llama2_7b_aqlm_2bit_1x16,
     'AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf',