From fb974069eaaadf23a60bc58f480744e728c50d78 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 16 Apr 2024 13:39:47 +0800 Subject: [PATCH 01/26] update use_hf --- ...271\211\344\270\216\346\213\223\345\261\225.md" | 3 +-- docs/source/Multi-Modal/index.md | 14 +++++++------- docs/source_en/LLM/Customization.md | 3 +-- swift/llm/dpo.py | 3 +-- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" index fac24df81f..6815159727 100644 --- "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" +++ "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" @@ -257,8 +257,7 @@ if __name__ == '__main__': - `get_function`: 默认值为`None`. 获取model和tokenizer的函数. 如果传入None, 则使用修饰器方案进行模型注册. 如果传入一个函数, 则使用正常方案进行注册. - `requires`: 默认为`[]`. 表示模型所需要的区别于其他模型的依赖. 该参数一般不需要设置. - `torch_dtype`: 默认为`None`. 表示模型所推荐使用的torch_dtype. 该参数一般不需要设置. -- `use_hf`: 默认为`False`, 即设置为modelscope hub. 如果你要使用huggingface hub, 你可以设置为True. -- `revision`: 默认为`None`. 用于指定模型的版本号, 如果`use_hf`为False, 则设置为'master', 如果`use_hf`为True, 则设置为'main'. 如果`model_id_or_path`是本地的模型目录, 则该参数失效. 该参数一般不需要设置. +- `revision`: 默认为`None`. 用于指定模型的版本号. 如果`model_id_or_path`是本地的模型目录, 则该参数失效. 该参数一般不需要设置. - `ignore_file_pattern`: 默认为`None`. 表示下载的时候需要忽略的文件名的正则pattern, 该参数会传递给`snapshot_download`. 例如`r'.+\.bin$'`, `r'.+\.savetensors$'`等. 该参数一般不需要设置. - `**kwargs`: 其他用于注释模型能力的参数. 该参数一般不需要设置. diff --git a/docs/source/Multi-Modal/index.md b/docs/source/Multi-Modal/index.md index fecbb3c493..50475ace57 100644 --- a/docs/source/Multi-Modal/index.md +++ b/docs/source/Multi-Modal/index.md @@ -3,11 +3,11 @@ ### Multi-Modal最佳实践系列 1. [Qwen-VL最佳实践](qwen-vl最佳实践.md) -2. [Qwen-Audio最佳实践](qwen-auidio最佳实践.md) +2. [Qwen-Audio最佳实践](qwen-audio最佳实践.md) 3. [Llava最佳实践](llava最佳实践.md) -4. 
[Deepseek-VL最佳实践](../Multi-Modal/deepseek-vl最佳实践.md) -5. [Yi-VL最佳实践.md](../Multi-Modal/yi-vl最佳实践.md) -6. [Internlm2-Xcomposers最佳实践](../Multi-Modal/internlm-xcomposer2最佳实践.md) -7. [MiniCPM-V最佳实践](../Multi-Modal/minicpm-v最佳实践.md), [MiniCPM-V-2最佳实践](../Multi-Modal/minicpm-v-2最佳实践.md) -8. [CogVLM最佳实践](../Multi-Modal/cogvlm最佳实践.md) -9. [mPLUG-Owl2最佳实践](../Multi-Modal/mplug-owl2最佳实践.md) +4. [Deepseek-VL最佳实践](deepseek-vl最佳实践.md) +5. [Yi-VL最佳实践](yi-vl最佳实践.md) +6. [Internlm-Xcomposer2最佳实践](internlm-xcomposer2最佳实践.md) +7. [MiniCPM-V最佳实践](minicpm-v最佳实践.md), [MiniCPM-V-2最佳实践](minicpm-v-2最佳实践.md) +8. [CogVLM最佳实践](cogvlm最佳实践.md) +9. [mPLUG-Owl2最佳实践](mplug-owl2最佳实践.md) diff --git a/docs/source_en/LLM/Customization.md b/docs/source_en/LLM/Customization.md index 59e7d1939f..1cdf6a8ccd 100644 --- a/docs/source_en/LLM/Customization.md +++ b/docs/source_en/LLM/Customization.md @@ -256,8 +256,7 @@ if __name__ == '__main__': - `get_function`: Default value is `None`. The function to get model and tokenizer. If passed `None`, the decorator approach will be used to register the model. If passed a function, the normal approach will be used to register. - `requires`: Default is `[]`. Represents the dependencies required by the model that differ from other models. This parameter generally does not need to be set. - `torch_dtype`: Default is `None`. Represents the recommended torch_dtype for the model to use. This parameter generally does not need to be set. -- `use_hf`: Default is `False`, i.e. set to modelscope hub. If you want to use huggingface hub, you can set it to True. -- `revision`: Default is `None`. Used to specify the version number of the model. If `use_hf` is False, it is set to 'master', if `use_hf` is True, it is set to 'main'. If `model_id_or_path` is a local model directory, this parameter is not effective. This parameter generally does not need to be set. +- `revision`: Default is `None`. Used to specify the version number of the model. 
If `model_id_or_path` is a local model directory, this parameter is not effective. This parameter generally does not need to be set. - `ignore_file_pattern`: Default is `None`. Represents the regular pattern of file names to be ignored when downloading, this parameter will be passed to `snapshot_download`. For example, `r'.+\.bin$'`, `r'.+\.savetensors$'`, etc. This parameter generally does not need to be set. - `**kwargs`: Other parameters used to annotate model capabilities. This parameter generally does not need to be set. diff --git a/swift/llm/dpo.py b/swift/llm/dpo.py index a43a46a724..19d6d9b543 100644 --- a/swift/llm/dpo.py +++ b/swift/llm/dpo.py @@ -12,10 +12,9 @@ from swift.utils import (check_json_format, get_dist_setting, get_logger, get_main, get_model_info, is_ddp_plus_mp, is_dist, is_master, plot_images, seed_everything, show_layers) -from . import get_time_info from .tuner import prepare_model from .utils import (DPOArguments, Template, get_dataset, get_model_tokenizer, - get_template, set_generation_config) + get_template, set_generation_config, get_time_info) logger = get_logger() From 1432b32c1acb8bb226798c8c9a6645375ff1dba5 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 16 Apr 2024 15:47:36 +0800 Subject: [PATCH 02/26] update use_hf --- swift/llm/dpo.py | 2 +- swift/llm/utils/model.py | 798 +++++++++++++++++++++++++-------------- 2 files changed, 524 insertions(+), 276 deletions(-) diff --git a/swift/llm/dpo.py b/swift/llm/dpo.py index 19d6d9b543..58e6de18cc 100644 --- a/swift/llm/dpo.py +++ b/swift/llm/dpo.py @@ -14,7 +14,7 @@ is_master, plot_images, seed_everything, show_layers) from .tuner import prepare_model from .utils import (DPOArguments, Template, get_dataset, get_model_tokenizer, - get_template, set_generation_config, get_time_info) + get_template, get_time_info, set_generation_config) logger = get_logger() diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 0b5c2cb9a4..2cf09e95c9 100644 --- 
a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -23,6 +23,7 @@ PreTrainedTokenizerBase) from transformers.dynamic_module_utils import get_class_from_dynamic_module from transformers.models.auto.tokenization_auto import get_tokenizer_config +from transformers.utils import strtobool from transformers.utils.versions import require_version from swift import get_logger @@ -95,6 +96,7 @@ class ModelType: qwen1half_4b_chat_awq = 'qwen1half-4b-chat-awq' qwen1half_7b_chat_awq = 'qwen1half-7b-chat-awq' qwen1half_14b_chat_awq = 'qwen1half-14b-chat-awq' + qwen1half_32b_chat_awq = 'qwen1half-32b-chat-awq' qwen1half_72b_chat_awq = 'qwen1half-72b-chat-awq' # qwen-vl @@ -127,6 +129,7 @@ class ModelType: yi_6b_200k = 'yi-6b-200k' yi_6b_chat = 'yi-6b-chat' yi_9b = 'yi-9b' + yi_9b_200k = 'yi-9b-200k' yi_34b = 'yi-34b' yi_34b_200k = 'yi-34b-200k' yi_34b_chat = 'yi-34b-chat' @@ -355,7 +358,7 @@ def register_model( *, requires: Optional[List[str]] = None, torch_dtype: Optional[Dtype] = None, - use_hf: bool = False, + hf_model_id: Optional[str] = None, revision: Optional[str] = None, ignore_file_pattern: Optional[List[str]] = None, function_kwargs: Optional[Dict[str, Any]] = None, @@ -373,7 +376,7 @@ def register_model( if function_kwargs is None: function_kwargs = {} if revision is None: - revision = 'main' if use_hf else 'master' + revision = 'master' model_info = { 'model_id_or_path': model_id_or_path, 'lora_target_modules': lora_target_modules, @@ -381,7 +384,7 @@ def register_model( 'requires': requires, 'torch_dtype': torch_dtype, 'ignore_file_pattern': ignore_file_pattern, - 'use_hf': use_hf, + 'hf_model_id': hf_model_id, 'revision': revision, 'eos_token': eos_token, **kwargs @@ -412,70 +415,125 @@ def _register_model( 'Shanghai_AI_Laboratory/internlm-20b', LoRATM.llama2, TemplateType.default_generation_bos, - support_vllm=True) + support_vllm=True, + hf_model_id='internlm/internlm2-20b') @register_model( ModelType.internlm_7b, 'Shanghai_AI_Laboratory/internlm-7b', 
LoRATM.llama2, TemplateType.default_generation_bos, - support_vllm=True) -@register_model(ModelType.bluelm_7b_chat_32k, 'vivo-ai/BlueLM-7B-Chat-32K', - LoRATM.llama2, TemplateType.bluelm) -@register_model(ModelType.bluelm_7b_chat, 'vivo-ai/BlueLM-7B-Chat', - LoRATM.llama2, TemplateType.bluelm) -@register_model(ModelType.bluelm_7b_32k, 'vivo-ai/BlueLM-7B-Base-32K', - LoRATM.llama2, TemplateType.default_generation_bos) -@register_model(ModelType.bluelm_7b, 'vivo-ai/BlueLM-7B-Base', LoRATM.llama2, - TemplateType.default_generation_bos) + support_vllm=True, + hf_model_id='internlm/internlm-7b') +@register_model( + ModelType.bluelm_7b_chat_32k, + 'vivo-ai/BlueLM-7B-Chat-32K', + LoRATM.llama2, + TemplateType.bluelm, + hf_model_id='vivo-ai/BlueLM-7B-Chat-32K') +@register_model( + ModelType.bluelm_7b_chat, + 'vivo-ai/BlueLM-7B-Chat', + LoRATM.llama2, + TemplateType.bluelm, + hf_model_id='vivo-ai/BlueLM-7B-Chat') +@register_model( + ModelType.bluelm_7b_32k, + 'vivo-ai/BlueLM-7B-Base-32K', + LoRATM.llama2, + TemplateType.default_generation_bos, + hf_model_id='vivo-ai/BlueLM-7B-Base-32K') +@register_model( + ModelType.bluelm_7b, + 'vivo-ai/BlueLM-7B-Base', + LoRATM.llama2, + TemplateType.default_generation_bos, + hf_model_id='vivo-ai/BlueLM-7B-Base') @register_model( ModelType.seqgpt_560m, 'damo/nlp_seqgpt-560m', LoRATM.bloom, TemplateType.default_generation, - support_vllm=True) -@register_model(ModelType.xverse_13b_chat, 'xverse/XVERSE-13B-Chat', - LoRATM.llama2, TemplateType.xverse) -@register_model(ModelType.xverse_13b, 'xverse/XVERSE-13B', LoRATM.llama2, - TemplateType.default_generation) -@register_model(ModelType.xverse_65b, 'xverse/XVERSE-65B', LoRATM.llama2, - TemplateType.default_generation) -@register_model(ModelType.xverse_65b_v2, 'xverse/XVERSE-65B-2', LoRATM.llama2, - TemplateType.default_generation) -@register_model(ModelType.xverse_65b_chat, 'xverse/XVERSE-65B-Chat', - LoRATM.llama2, TemplateType.xverse) + support_vllm=True, + 
hf_model_id='DAMO-NLP/SeqGPT-560M') +@register_model( + ModelType.xverse_13b_chat, + 'xverse/XVERSE-13B-Chat', + LoRATM.llama2, + TemplateType.xverse, + hf_model_id='xverse/XVERSE-13B-Chat') +@register_model( + ModelType.xverse_13b, + 'xverse/XVERSE-13B', + LoRATM.llama2, + TemplateType.default_generation, + hf_model_id='xverse/XVERSE-13B') +@register_model( + ModelType.xverse_65b, + 'xverse/XVERSE-65B', + LoRATM.llama2, + TemplateType.default_generation, + hf_model_id='xverse/XVERSE-65B') +@register_model( + ModelType.xverse_65b_v2, + 'xverse/XVERSE-65B-2', + LoRATM.llama2, + TemplateType.default_generation, + hf_model_id='xverse/XVERSE-65B-2') +@register_model( + ModelType.xverse_65b_chat, + 'xverse/XVERSE-65B-Chat', + LoRATM.llama2, + TemplateType.xverse, + hf_model_id='xverse/XVERSE-65B-Chat') @register_model( ModelType.xverse_13b_256k, 'xverse/XVERSE-13B-256K', LoRATM.llama2, TemplateType.default_generation, - revision='v1.0.0') -@register_model(ModelType.xverse_7b_chat, 'xverse/XVERSE-7B-Chat', - LoRATM.llama2, TemplateType.xverse) -@register_model(ModelType.xverse_7b, 'xverse/XVERSE-7B', LoRATM.llama2, - TemplateType.default_generation) -@register_model(ModelType.xverse_moe_a4_2b, 'xverse/XVERSE-MoE-A4.2B', - LoRATM.llama2, TemplateType.default_generation) + revision='v1.0.0', + hf_model_id='xverse/XVERSE-13B-256K') +@register_model( + ModelType.xverse_7b_chat, + 'xverse/XVERSE-7B-Chat', + LoRATM.llama2, + TemplateType.xverse, + hf_model_id='xverse/XVERSE-7B-Chat') +@register_model( + ModelType.xverse_7b, + 'xverse/XVERSE-7B', + LoRATM.llama2, + TemplateType.default_generation, + hf_model_id='xverse/XVERSE-7B') +@register_model( + ModelType.xverse_moe_a4_2b, + 'xverse/XVERSE-MoE-A4.2B', + LoRATM.llama2, + TemplateType.default_generation, + hf_model_id='xverse/XVERSE-MoE-A4.2B') @register_model( ModelType.baichuan_13b_chat, 'baichuan-inc/Baichuan-13B-Chat', LoRATM.baichuan, TemplateType.baichuan, requires=['transformers<4.34'], - support_vllm=True) + 
support_vllm=True, + hf_model_id='baichuan-inc/Baichuan-13B-Chat') @register_model( ModelType.baichuan_7b, 'baichuan-inc/baichuan-7B', LoRATM.baichuan, TemplateType.default_generation, requires=['transformers<4.34'], - support_vllm=True) + support_vllm=True, + hf_model_id='baichuan-inc/Baichuan-7B') @register_model( ModelType.mengzi3_13b_base, 'langboat/Mengzi3-13B-Base', LoRATM.llama2, TemplateType.mengzi, support_vllm=True, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Langboat/Mengzi3-13B-Base') @register_model( ModelType.c4ai_command_r_v01, 'AI-ModelScope/c4ai-command-r-v01', @@ -483,7 +541,8 @@ def _register_model( TemplateType.c4ai, requires=['transformers>=4.39.1'], support_vllm=False, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='CohereForAI/c4ai-command-r-v01') @register_model( ModelType.c4ai_command_r_plus, 'AI-ModelScope/c4ai-command-r-plus', @@ -491,7 +550,8 @@ def _register_model( TemplateType.c4ai, requires=['transformers>4.39'], support_vllm=False, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='CohereForAI/c4ai-command-r-plus') def get_model_tokenizer_from_repo(model_dir: str, torch_dtype: Optional[Dtype], model_kwargs: Dict[str, Any], @@ -531,7 +591,8 @@ def get_model_tokenizer_from_repo(model_dir: str, LoRATM.grok_1, TemplateType.default_generation, support_vllm=False, - support_flash_attn=False) + support_flash_attn=False, + hf_model_id='hpcai-tech/grok-1') def get_model_tokenizer_grok(model_dir: str, torch_dtype: Optional[Dtype], model_kwargs: Dict[str, Any], @@ -568,42 +629,48 @@ def get_model_tokenizer_grok(model_dir: str, LoRATM.mamba, TemplateType.default_generation, requires=['transformers>=4.39.0'], - support_vllm=False) + support_vllm=False, + hf_model_id='state-spaces/mamba-130m-hf') @register_model( ModelType.mamba_370m, 'AI-ModelScope/mamba-370m-hf', LoRATM.mamba, TemplateType.default_generation, requires=['transformers>=4.39.0'], - support_vllm=False) + 
support_vllm=False, + hf_model_id='state-spaces/mamba-370m-hf') @register_model( ModelType.mamba_390m, 'AI-ModelScope/mamba-390m-hf', LoRATM.mamba, TemplateType.default_generation, requires=['transformers>=4.39.0'], - support_vllm=False) + support_vllm=False, + hf_model_id='state-spaces/mamba-390m-hf') @register_model( ModelType.mamba_790m, 'AI-ModelScope/mamba-790m-hf', LoRATM.mamba, TemplateType.default_generation, requires=['transformers>=4.39.0'], - support_vllm=False) + support_vllm=False, + hf_model_id='state-spaces/mamba-790m-hf') @register_model( ModelType.mamba_1_4b, 'AI-ModelScope/mamba-1.4b-hf', LoRATM.mamba, TemplateType.default_generation, requires=['transformers>=4.39.0'], - support_vllm=False) + support_vllm=False, + hf_model_id='state-spaces/mamba-1.4b-hf') @register_model( ModelType.mamba_2_8b, 'AI-ModelScope/mamba-2.8b-hf', LoRATM.mamba, TemplateType.default_generation, requires=['transformers>=4.39.0'], - support_vllm=False) + support_vllm=False, + hf_model_id='state-spaces/mamba-2.8b-hf') def get_model_tokenizer_mamba(model_dir: str, torch_dtype: Optional[Dtype], model_kwargs: Dict[str, Any], @@ -622,21 +689,24 @@ def get_model_tokenizer_mamba(model_dir: str, LoRATM.cogvlm, TemplateType.cogvlm_instruct, support_gradient_checkpointing=False, - tags=['multi-modal', 'vision']) + tags=['multi-modal', 'vision'], + hf_model_id='THUDM/cogvlm-chat-hf') @register_model( ModelType.cogagent_18b_chat, 'ZhipuAI/cogagent-chat', LoRATM.cogagent, TemplateType.cogagent_chat, support_gradient_checkpointing=False, - tags=['multi-modal', 'vision']) + tags=['multi-modal', 'vision'], + hf_model_id='THUDM/cogagent-chat-hf') @register_model( ModelType.cogagent_18b_instruct, 'ZhipuAI/cogagent-vqa', LoRATM.cogagent, TemplateType.cogagent_instruct, support_gradient_checkpointing=False, - tags=['multi-modal', 'vision']) + tags=['multi-modal', 'vision'], + hf_model_id='THUDM/cogagent-vqa-hf') def get_model_tokenizer_cogagent(model_dir: str, torch_dtype: Dtype, model_kwargs: 
Dict[str, Any], @@ -666,7 +736,8 @@ def get_model_tokenizer_cogagent(model_dir: str, 'Shanghai_AI_Laboratory/internlm-chat-20b', LoRATM.llama2, TemplateType.internlm, - support_vllm=True) + support_vllm=True, + hf_model_id='internlm/internlm2-chat-20b') @register_model( ModelType.internlm_7b_chat_8k, 'Shanghai_AI_Laboratory/internlm-chat-7b-8k', @@ -675,10 +746,11 @@ def get_model_tokenizer_cogagent(model_dir: str, support_vllm=True) @register_model( ModelType.internlm_7b_chat, - 'Shanghai_AI_Laboratory/internlm-chat-7b-v1_1', + 'Shanghai_AI_Laboratory/internlm-chat-7b', LoRATM.llama2, TemplateType.internlm, - support_vllm=True) + support_vllm=True, + hf_model_id='internlm/internlm-chat-7b') def get_model_tokenizer_internlm_chat(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -699,7 +771,8 @@ def get_model_tokenizer_internlm_chat(model_dir: str, LoRATM.baichuan, TemplateType.default_generation, requires=['transformers<4.34'], - support_vllm=True) + support_vllm=True, + hf_model_id='baichuan-inc/Baichuan-13B-Base') def get_model_tokenizer_baichuan_13b(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -722,13 +795,15 @@ def get_model_tokenizer_baichuan_13b(model_dir: str, 'baichuan-inc/Baichuan2-13B-Chat', LoRATM.baichuan, TemplateType.baichuan, - support_vllm=True) + support_vllm=True, + hf_model_id='baichuan-inc/Baichuan2-13B-Chat') @register_model( ModelType.baichuan2_13b, 'baichuan-inc/Baichuan2-13B-Base', LoRATM.baichuan, TemplateType.default_generation, - support_vllm=True) + support_vllm=True, + hf_model_id='baichuan-inc/Baichuan2-13B-Base') def get_model_tokenizer_baichuan2_13b(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -767,13 +842,15 @@ def patch_baichuan2_lm_head_forward(self, hidden_states: Tensor) -> Tensor: 'baichuan-inc/Baichuan2-7B-Chat', LoRATM.baichuan, TemplateType.baichuan, - support_vllm=True) + support_vllm=True, + hf_model_id='baichuan-inc/Baichuan2-7B-Chat') @register_model( 
ModelType.baichuan2_7b, 'baichuan-inc/Baichuan2-7B-Base', LoRATM.baichuan, TemplateType.default_generation, - support_vllm=True) + support_vllm=True, + hf_model_id='baichuan-inc/Baichuan2-7B-Base') def get_model_tokenizer_baichuan2(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -811,14 +888,16 @@ def get_model_tokenizer_baichuan2(model_dir: str, 'get_baichuan2_function': get_model_tokenizer_baichuan2_13b }, torch_dtype=torch.bfloat16, - requires=['bitsandbytes<0.41.2', 'accelerate<0.26']) + requires=['bitsandbytes<0.41.2', 'accelerate<0.26'], + hf_model_id='baichuan-inc/Baichuan2-13B-Chat-4bits') @register_model( ModelType.baichuan2_7b_chat_int4, 'baichuan-inc/Baichuan2-7B-Chat-4bits', LoRATM.baichuan, TemplateType.baichuan, torch_dtype=torch.bfloat16, - requires=['bitsandbytes<0.41.2', 'accelerate<0.26']) + requires=['bitsandbytes<0.41.2', 'accelerate<0.26'], + hf_model_id='baichuan-inc/Baichuan2-7B-Chat-4bits') def get_model_tokenizer_baichuan2_int4(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -864,37 +943,43 @@ def remove_property(tokenizer_cls: Type[PreTrainedTokenizerBase], TemplateType.codefuse, requires=['transformers<4.34'], support_vllm=True, - tags=['coding']) + tags=['coding'], + hf_model_id='codefuse-ai/CodeFuse-CodeGeeX2-6B') @register_model( ModelType.chatglm3_6b_32k, 'ZhipuAI/chatglm3-6b-32k', LoRATM.chatglm, TemplateType.chatglm3, - support_vllm=True) + support_vllm=True, + hf_model_id='THUDM/chatglm3-6b-32k') @register_model( ModelType.chatglm3_6b, 'ZhipuAI/chatglm3-6b', LoRATM.chatglm, TemplateType.chatglm3, - support_vllm=True) + support_vllm=True, + hf_model_id='THUDM/chatglm3-6b') @register_model( ModelType.chatglm3_6b_base, 'ZhipuAI/chatglm3-6b-base', LoRATM.chatglm, TemplateType.chatglm_generation, - support_vllm=True) + support_vllm=True, + hf_model_id='THUDM/chatglm3-6b-base') @register_model( ModelType.chatglm2_6b_32k, 'ZhipuAI/chatglm2-6b-32k', LoRATM.chatglm, TemplateType.chatglm2, - 
support_vllm=True) + support_vllm=True, + hf_model_id='THUDM/chatglm2-6b-32k') @register_model( ModelType.chatglm2_6b, 'ZhipuAI/chatglm2-6b', LoRATM.chatglm, TemplateType.chatglm2, - support_vllm=True) + support_vllm=True, + hf_model_id='THUDM/chatglm2-6b') @register_model( ModelType.codegeex2_6b, 'ZhipuAI/codegeex2-6b', @@ -902,7 +987,8 @@ def remove_property(tokenizer_cls: Type[PreTrainedTokenizerBase], TemplateType.chatglm_generation, requires=['transformers<4.34'], support_vllm=True, - tags=['coding']) + tags=['coding'], + hf_model_id='THUDM/codegeex2-6b') def get_model_tokenizer_chatglm(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -945,7 +1031,8 @@ def cross_entropy_forward(self, inputs: Tensor, requires=['transformers>=4.38'], ignore_file_pattern=[r'.+\.gguf$'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='google/gemma-2b') @register_model( ModelType.gemma_7b, 'AI-ModelScope/gemma-7b', @@ -954,7 +1041,8 @@ def cross_entropy_forward(self, inputs: Tensor, requires=['transformers>=4.38'], ignore_file_pattern=[r'.+\.gguf$'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='google/gemma-7b') @register_model( ModelType.gemma_2b_instruct, 'AI-ModelScope/gemma-2b-it', @@ -963,7 +1051,8 @@ def cross_entropy_forward(self, inputs: Tensor, requires=['transformers>=4.38'], ignore_file_pattern=[r'.+\.gguf$'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='google/gemma-2b-it') @register_model( ModelType.gemma_7b_instruct, 'AI-ModelScope/gemma-7b-it', @@ -972,7 +1061,8 @@ def cross_entropy_forward(self, inputs: Tensor, requires=['transformers>=4.38'], ignore_file_pattern=[r'.+\.gguf$'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='google/gemma-7b-it') @register_model( ModelType.deepseek_math_7b_instruct, 'deepseek-ai/deepseek-math-7b-instruct', @@ -980,7 +1070,8 @@ def cross_entropy_forward(self, inputs: Tensor, 
TemplateType.deepseek, support_flash_attn=True, support_vllm=True, - tags=['math']) + tags=['math'], + hf_model_id='deepseek-ai/deepseek-math-7b-instruct') @register_model( ModelType.deepseek_math_7b_chat, 'deepseek-ai/deepseek-math-7b-rl', @@ -988,7 +1079,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.deepseek, support_flash_attn=True, support_vllm=True, - tags=['math']) + tags=['math'], + hf_model_id='deepseek-ai/deepseek-math-7b-rl') @register_model( ModelType.deepseek_math_7b, 'deepseek-ai/deepseek-math-7b-base', @@ -996,7 +1088,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation_bos, support_flash_attn=True, support_vllm=True, - tags=['math']) + tags=['math'], + hf_model_id='deepseek-ai/deepseek-math-7b-base') @register_model( ModelType.qwen1half_0_5b, 'qwen/Qwen1.5-0.5B', @@ -1004,7 +1097,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-0.5B') @register_model( ModelType.qwen1half_1_8b, 'qwen/Qwen1.5-1.8B', @@ -1012,7 +1106,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-1.8B') @register_model( ModelType.qwen1half_4b, 'qwen/Qwen1.5-4B', @@ -1020,7 +1115,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-4B') @register_model( ModelType.qwen1half_7b, 'qwen/Qwen1.5-7B', @@ -1028,7 +1124,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + 
hf_model_id='Qwen/Qwen1.5-7B') @register_model( ModelType.qwen1half_14b, 'qwen/Qwen1.5-14B', @@ -1036,7 +1133,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-14B') @register_model( ModelType.qwen1half_32b, 'qwen/Qwen1.5-32B', @@ -1044,7 +1142,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-32B') @register_model( ModelType.qwen1half_72b, 'qwen/Qwen1.5-72B', @@ -1052,7 +1151,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-72B') @register_model( ModelType.qwen1half_moe_a2_7b, 'qwen/Qwen1.5-MoE-A2.7B', @@ -1060,7 +1160,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-MoE-A2.7B') @register_model( ModelType.deepseek_coder_1_3b, 'deepseek-ai/deepseek-coder-1.3b-base', @@ -1068,7 +1169,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation_bos, support_flash_attn=True, support_vllm=True, - tags=['coding']) + tags=['coding'], + hf_model_id='deepseek-ai/deepseek-coder-1.3b-base') @register_model( ModelType.deepseek_coder_6_7b, 'deepseek-ai/deepseek-coder-6.7b-base', @@ -1076,7 +1178,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation_bos, support_flash_attn=True, support_vllm=True, - tags=['coding']) + tags=['coding'], + hf_model_id='deepseek-ai/deepseek-coder-6.7b-base') @register_model( 
ModelType.deepseek_coder_33b, 'deepseek-ai/deepseek-coder-33b-base', @@ -1084,7 +1187,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation_bos, support_flash_attn=True, support_vllm=True, - tags=['coding']) + tags=['coding'], + hf_model_id='deepseek-ai/deepseek-coder-33b-base') @register_model( ModelType.deepseek_coder_1_3b_instruct, 'deepseek-ai/deepseek-coder-1.3b-instruct', @@ -1093,7 +1197,8 @@ def cross_entropy_forward(self, inputs: Tensor, eos_token='<|EOT|>', support_flash_attn=True, support_vllm=True, - tags=['coding']) + tags=['coding'], + hf_model_id='deepseek-ai/deepseek-coder-1.3b-instruct') @register_model( ModelType.deepseek_coder_6_7b_instruct, 'deepseek-ai/deepseek-coder-6.7b-instruct', @@ -1102,7 +1207,8 @@ def cross_entropy_forward(self, inputs: Tensor, eos_token='<|EOT|>', support_flash_attn=True, support_vllm=True, - tags=['coding']) + tags=['coding'], + hf_model_id='deepseek-ai/deepseek-coder-6.7b-instruct') @register_model( ModelType.deepseek_coder_33b_instruct, 'deepseek-ai/deepseek-coder-33b-instruct', @@ -1111,49 +1217,56 @@ def cross_entropy_forward(self, inputs: Tensor, eos_token='<|EOT|>', support_flash_attn=True, support_vllm=True, - tags=['coding']) + tags=['coding'], + hf_model_id='deepseek-ai/deepseek-coder-33b-instruct') @register_model( ModelType.openbuddy_deepseek_67b_chat, 'OpenBuddy/openbuddy-deepseek-67b-v15.2', LoRATM.llama2, TemplateType.openbuddy, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='OpenBuddy/openbuddy-deepseek-67b-v15.2') @register_model( ModelType.deepseek_67b_chat, 'deepseek-ai/deepseek-llm-67b-chat', LoRATM.llama2, TemplateType.deepseek, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='deepseek-ai/deepseek-llm-67b-chat') @register_model( ModelType.deepseek_67b, 'deepseek-ai/deepseek-llm-67b-base', LoRATM.llama2, TemplateType.default_generation_bos, support_flash_attn=True, - support_vllm=True) + 
support_vllm=True, + hf_model_id='deepseek-ai/deepseek-llm-67b-base') @register_model( ModelType.deepseek_7b_chat, 'deepseek-ai/deepseek-llm-7b-chat', LoRATM.llama2, TemplateType.deepseek, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='deepseek-ai/deepseek-llm-7b-chat') @register_model( ModelType.deepseek_7b, 'deepseek-ai/deepseek-llm-7b-base', LoRATM.llama2, TemplateType.default_generation_bos, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='deepseek-ai/deepseek-llm-7b-base') @register_model( ModelType.sus_34b_chat, 'SUSTC/SUS-Chat-34B', LoRATM.llama2, TemplateType.sus, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='SUSTech/SUS-Chat-34B') @register_model( ModelType.openbuddy_zephyr_7b_chat, 'OpenBuddy/openbuddy-zephyr-7b-v14.1', @@ -1161,7 +1274,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.openbuddy, requires=['transformers>=4.34'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='OpenBuddy/openbuddy-zephyr-7b-v14.1') @register_model( ModelType.zephyr_7b_beta_chat, 'modelscope/zephyr-7b-beta', @@ -1169,7 +1283,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.zephyr, requires=['transformers>=4.34'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='HuggingFaceH4/zephyr-7b-beta') @register_model( ModelType.yi_6b_chat, '01ai/Yi-6B-Chat', @@ -1177,7 +1292,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.yi, eos_token='<|im_end|>', support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='01-ai/Yi-6B-Chat') @register_model( ModelType.yi_34b_chat, '01ai/Yi-34B-Chat', @@ -1185,56 +1301,72 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.yi, eos_token='<|im_end|>', support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='01-ai/Yi-34B-Chat') @register_model( ModelType.yi_34b_200k, 
'01ai/Yi-34B-200K', LoRATM.llama2, TemplateType.default_generation, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='01-ai/Yi-34B-200K') @register_model( ModelType.yi_34b, '01ai/Yi-34B', LoRATM.llama2, TemplateType.default_generation, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='01-ai/Yi-34B') @register_model( ModelType.yi_6b_200k, '01ai/Yi-6B-200K', LoRATM.llama2, TemplateType.default_generation, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='01-ai/Yi-6B-200K') @register_model( ModelType.yi_9b, '01ai/Yi-9B', LoRATM.llama2, TemplateType.default_generation, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='01-ai/Yi-9B') +@register_model( + ModelType.yi_9b_200k, + '01ai/Yi-9B-200K', + LoRATM.llama2, + TemplateType.default_generation, + support_flash_attn=True, + support_vllm=True, + hf_model_id='01-ai/Yi-9B-200K') @register_model( ModelType.yi_6b, '01ai/Yi-6B', LoRATM.llama2, TemplateType.default_generation, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='01-ai/Yi-6B') @register_model( ModelType.ziya2_13b_chat, 'Fengshenbang/Ziya2-13B-Chat', LoRATM.llama2, TemplateType.ziya, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='IDEA-CCNL/Ziya2-13B-Chat') @register_model( ModelType.ziya2_13b, 'Fengshenbang/Ziya2-13B-Base', LoRATM.llama2, TemplateType.default_generation_bos, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='IDEA-CCNL/Ziya2-13B-Base') @register_model( ModelType.openbuddy_mixtral_moe_7b_chat, 'OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k', @@ -1243,7 +1375,8 @@ def cross_entropy_forward(self, inputs: Tensor, requires=['transformers>=4.36'], support_flash_attn=True, support_vllm=True, - support_gradient_checkpointing=False) + support_gradient_checkpointing=False, + hf_model_id='OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k') 
@register_model( ModelType.openbuddy_mistral_7b_chat, 'OpenBuddy/openbuddy-mistral-7b-v17.1-32k', @@ -1251,28 +1384,32 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.openbuddy, requires=['transformers>=4.34'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='OpenBuddy/openbuddy-mistral-7b-v17.1-32k') @register_model( ModelType.openbuddy_llama2_70b_chat, 'OpenBuddy/openbuddy-llama2-70b-v10.1-bf16', LoRATM.llama2, TemplateType.openbuddy, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='OpenBuddy/openbuddy-llama2-70b-v10.1-bf16') @register_model( ModelType.openbuddy_llama2_65b_chat, 'OpenBuddy/openbuddy-llama-65b-v8-bf16', LoRATM.llama2, TemplateType.openbuddy, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='OpenBuddy/openbuddy-llama-65b-v8-bf16') @register_model( ModelType.openbuddy_llama2_13b_chat, 'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16', LoRATM.llama2, TemplateType.openbuddy, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='OpenBuddy/openbuddy-llama2-13b-v8.1-fp16') @register_model( ModelType.mistral_7b_instruct, 'AI-ModelScope/Mistral-7B-Instruct-v0.1', @@ -1280,7 +1417,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.llama, requires=['transformers>=4.34'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='mistralai/Mistral-7B-Instruct-v0.1') @register_model( ModelType.mistral_7b_instruct_v2, 'AI-ModelScope/Mistral-7B-Instruct-v0.2', @@ -1288,7 +1426,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.llama, requires=['transformers>=4.34'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='mistralai/Mistral-7B-Instruct-v0.2') @register_model( ModelType.mistral_7b, 'AI-ModelScope/Mistral-7B-v0.1', @@ -1296,7 +1435,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation_bos, 
requires=['transformers>=4.34'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='mistralai/Mistral-7B-v0.1') @register_model( ModelType.mistral_7b_v2, 'AI-ModelScope/Mistral-7B-v0.2-hf', @@ -1304,25 +1444,30 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation_bos, requires=['transformers>=4.34'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='alpindale/Mistral-7B-v0.2-hf') @register_model( ModelType.mixtral_moe_7b, 'AI-ModelScope/Mixtral-8x7B-v0.1', LoRATM.llama2, TemplateType.default_generation_bos, requires=['transformers>=4.36'], + ignore_file_pattern=[r'.+\.pt$'], support_flash_attn=True, support_vllm=True, - support_gradient_checkpointing=False) + support_gradient_checkpointing=False, + hf_model_id='mistralai/Mixtral-8x7B-v0.1') @register_model( ModelType.mixtral_moe_7b_instruct, 'AI-ModelScope/Mixtral-8x7B-Instruct-v0.1', LoRATM.llama2, TemplateType.llama, requires=['transformers>=4.36'], + ignore_file_pattern=[r'.+\.pt$'], support_flash_attn=True, support_vllm=True, - support_gradient_checkpointing=False) + support_gradient_checkpointing=False, + hf_model_id='mistralai/Mixtral-8x7B-Instruct-v0.1') @register_model( ModelType.mixtral_moe_8x22b_v1, 'AI-ModelScope/Mixtral-8x22B-v0.1', @@ -1330,7 +1475,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation_bos, requires=['transformers>=4.36'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='mistral-community/Mixtral-8x22B-v0.1') @register_model( ModelType.dbrx_base, 'AI-ModelScope/dbrx-base', @@ -1339,7 +1485,8 @@ def cross_entropy_forward(self, inputs: Tensor, requires=['transformers>=4.36'], support_flash_attn=True, support_vllm=True, - support_gradient_checkpointing=False) + support_gradient_checkpointing=False, + hf_model_id='databricks/dbrx-base') @register_model( ModelType.dbrx_instruct, 'AI-ModelScope/dbrx-instruct', @@ -1348,7 +1495,8 @@ def 
cross_entropy_forward(self, inputs: Tensor, requires=['transformers>=4.36'], support_flash_attn=True, support_vllm=True, - support_gradient_checkpointing=False) + support_gradient_checkpointing=False, + hf_model_id='databricks/dbrx-instruct') def get_model_tokenizer_with_flash_attn(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -1381,7 +1529,8 @@ def get_model_tokenizer_with_flash_attn(model_dir: str, ignore_file_pattern=[r'.+\.bin$'], support_flash_attn=True, requires=['transformers>=4.38', 'aqlm', 'torch>=2.2.0'], - support_vllm=False) + support_vllm=False, + hf_model_id='ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf') @register_model( ModelType.mixtral_moe_7b_aqlm_2bit_1x16, 'AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf', @@ -1390,7 +1539,8 @@ def get_model_tokenizer_with_flash_attn(model_dir: str, requires=['transformers>=4.38', 'aqlm', 'torch>=2.2.0'], support_flash_attn=True, support_vllm=False, - support_gradient_checkpointing=False) + support_gradient_checkpointing=False, + hf_model_id='ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf') def get_model_tokenizer_aqlm(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -1415,7 +1565,8 @@ def get_model_tokenizer_aqlm(model_dir: str, support_flash_attn=True, support_vllm=True, function_kwargs={'is_awq': True}, - requires=['transformers>=4.37', 'autoawq']) + requires=['transformers>=4.37', 'autoawq'], + hf_model_id='Qwen/Qwen1.5-0.5B-Chat-AWQ') @register_model( ModelType.qwen1half_1_8b_chat_awq, 'qwen/Qwen1.5-1.8B-Chat-AWQ', @@ -1424,7 +1575,8 @@ def get_model_tokenizer_aqlm(model_dir: str, support_flash_attn=True, support_vllm=True, function_kwargs={'is_awq': True}, - requires=['transformers>=4.37', 'autoawq']) + requires=['transformers>=4.37', 'autoawq'], + hf_model_id='Qwen/Qwen1.5-1.8B-Chat-AWQ') @register_model( ModelType.qwen1half_4b_chat_awq, 'qwen/Qwen1.5-4B-Chat-AWQ', @@ -1433,7 +1585,8 @@ def get_model_tokenizer_aqlm(model_dir: str, support_flash_attn=True, support_vllm=True, 
function_kwargs={'is_awq': True}, - requires=['transformers>=4.37', 'autoawq']) + requires=['transformers>=4.37', 'autoawq'], + hf_model_id='Qwen/Qwen1.5-4B-Chat-AWQ') @register_model( ModelType.qwen1half_7b_chat_awq, 'qwen/Qwen1.5-7B-Chat-AWQ', @@ -1442,7 +1595,8 @@ def get_model_tokenizer_aqlm(model_dir: str, support_flash_attn=True, support_vllm=True, function_kwargs={'is_awq': True}, - requires=['transformers>=4.37', 'autoawq']) + requires=['transformers>=4.37', 'autoawq'], + hf_model_id='Qwen/Qwen1.5-7B-Chat-AWQ') @register_model( ModelType.qwen1half_14b_chat_awq, 'qwen/Qwen1.5-14B-Chat-AWQ', @@ -1451,7 +1605,18 @@ def get_model_tokenizer_aqlm(model_dir: str, support_flash_attn=True, support_vllm=True, function_kwargs={'is_awq': True}, - requires=['transformers>=4.37', 'autoawq']) + requires=['transformers>=4.37', 'autoawq'], + hf_model_id='Qwen/Qwen1.5-14B-Chat-AWQ') +@register_model( + ModelType.qwen1half_32b_chat_awq, + 'qwen/Qwen1.5-32B-Chat-AWQ', + LoRATM.qwen1half, + TemplateType.qwen, + support_flash_attn=True, + support_vllm=True, + function_kwargs={'is_awq': True}, + requires=['transformers>=4.37', 'autoawq'], + hf_model_id='Qwen/Qwen1.5-32B-Chat-AWQ') @register_model( ModelType.qwen1half_72b_chat_awq, 'qwen/Qwen1.5-72B-Chat-AWQ', @@ -1460,7 +1625,8 @@ def get_model_tokenizer_aqlm(model_dir: str, support_flash_attn=True, support_vllm=True, function_kwargs={'is_awq': True}, - requires=['transformers>=4.37', 'autoawq']) + requires=['transformers>=4.37', 'autoawq'], + hf_model_id='Qwen/Qwen1.5-72B-Chat-AWQ') @register_model( ModelType.qwen1half_0_5b_chat, 'qwen/Qwen1.5-0.5B-Chat', @@ -1468,7 +1634,8 @@ def get_model_tokenizer_aqlm(model_dir: str, TemplateType.qwen, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-0.5B-Chat') @register_model( ModelType.qwen1half_1_8b_chat, 'qwen/Qwen1.5-1.8B-Chat', @@ -1476,7 +1643,8 @@ def get_model_tokenizer_aqlm(model_dir: str, 
TemplateType.qwen, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-1.8B-Chat') @register_model( ModelType.qwen1half_4b_chat, 'qwen/Qwen1.5-4B-Chat', @@ -1484,7 +1652,8 @@ def get_model_tokenizer_aqlm(model_dir: str, TemplateType.qwen, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-4B-Chat') @register_model( ModelType.qwen1half_7b_chat, 'qwen/Qwen1.5-7B-Chat', @@ -1492,7 +1661,8 @@ def get_model_tokenizer_aqlm(model_dir: str, TemplateType.qwen, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-7B-Chat') @register_model( ModelType.qwen1half_14b_chat, 'qwen/Qwen1.5-14B-Chat', @@ -1500,7 +1670,8 @@ def get_model_tokenizer_aqlm(model_dir: str, TemplateType.qwen, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-14B-Chat') @register_model( ModelType.qwen1half_32b_chat, 'qwen/Qwen1.5-32B-Chat', @@ -1508,7 +1679,8 @@ def get_model_tokenizer_aqlm(model_dir: str, TemplateType.qwen, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-32B-Chat') @register_model( ModelType.qwen1half_72b_chat, 'qwen/Qwen1.5-72B-Chat', @@ -1516,7 +1688,8 @@ def get_model_tokenizer_aqlm(model_dir: str, TemplateType.qwen, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-72B-Chat') @register_model( ModelType.qwen1half_moe_a2_7b_chat, 'qwen/Qwen1.5-MoE-A2.7B-Chat', @@ -1524,7 +1697,8 @@ def get_model_tokenizer_aqlm(model_dir: str, TemplateType.qwen, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + 
requires=['transformers>=4.37'], + hf_model_id='Qwen/Qwen1.5-MoE-A2.7B-Chat') def get_model_tokenizer_qwen1half(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -1556,7 +1730,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, torch_dtype=torch.float16, function_kwargs={'bits': 4}, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4') @register_model( ModelType.qwen1half_0_5b_chat_int8, 'qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8', @@ -1565,7 +1740,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, function_kwargs={'bits': 8}, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8') @register_model( ModelType.qwen1half_1_8b_chat_int4, 'qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4', @@ -1575,7 +1751,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, torch_dtype=torch.float16, function_kwargs={'bits': 4}, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4') @register_model( ModelType.qwen1half_1_8b_chat_int8, 'qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8', @@ -1584,7 +1761,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, function_kwargs={'bits': 8}, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8') @register_model( ModelType.qwen1half_4b_chat_int4, 'qwen/Qwen1.5-4B-Chat-GPTQ-Int4', @@ -1594,7 +1772,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, torch_dtype=torch.float16, function_kwargs={'bits': 4}, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen1.5-4B-Chat-GPTQ-Int4') @register_model( ModelType.qwen1half_4b_chat_int8, 'qwen/Qwen1.5-4B-Chat-GPTQ-Int8', @@ -1603,7 +1782,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, 
requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, function_kwargs={'bits': 8}, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Qwen/Qwen1.5-4B-Chat-GPTQ-Int8') @register_model( ModelType.qwen1half_7b_chat_int4, 'qwen/Qwen1.5-7B-Chat-GPTQ-Int4', @@ -1613,7 +1793,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, torch_dtype=torch.float16, function_kwargs={'bits': 4}, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen1.5-7B-Chat-GPTQ-Int4') @register_model( ModelType.qwen1half_7b_chat_int8, 'qwen/Qwen1.5-7B-Chat-GPTQ-Int8', @@ -1622,7 +1803,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, function_kwargs={'bits': 8}, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Qwen/Qwen1.5-7B-Chat-GPTQ-Int8') @register_model( ModelType.qwen1half_14b_chat_int4, 'qwen/Qwen1.5-14B-Chat-GPTQ-Int4', @@ -1632,7 +1814,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, torch_dtype=torch.float16, function_kwargs={'bits': 4}, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen1.5-14B-Chat-GPTQ-Int4') @register_model( ModelType.qwen1half_14b_chat_int8, 'qwen/Qwen1.5-14B-Chat-GPTQ-Int8', @@ -1641,7 +1824,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, function_kwargs={'bits': 8}, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Qwen/Qwen1.5-14B-Chat-GPTQ-Int8') @register_model( ModelType.qwen1half_32b_chat_int4, 'qwen/Qwen1.5-32B-Chat-GPTQ-Int4', @@ -1651,7 +1835,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, torch_dtype=torch.float16, function_kwargs={'bits': 4}, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen1.5-32B-Chat-GPTQ-Int4') @register_model( ModelType.qwen1half_72b_chat_int4, 
'qwen/Qwen1.5-72B-Chat-GPTQ-Int4', @@ -1661,7 +1846,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, torch_dtype=torch.float16, function_kwargs={'bits': 4}, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen1.5-72B-Chat-GPTQ-Int4') @register_model( ModelType.qwen1half_72b_chat_int8, 'qwen/Qwen1.5-72B-Chat-GPTQ-Int8', @@ -1670,7 +1856,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, function_kwargs={'bits': 8}, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Qwen/Qwen1.5-72B-Chat-GPTQ-Int8') @register_model( ModelType.qwen1half_moe_a2_7b_chat_int4, 'qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4', @@ -1679,7 +1866,8 @@ def get_model_tokenizer_qwen1half(model_dir: str, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, function_kwargs={'bits': 4}, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4') def get_model_tokenizer_qwen1half_intx(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -1696,7 +1884,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str, LoRATM.internlm2, TemplateType.default_generation_bos, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='internlm/internlm2-1_8b') @register_model( ModelType.internlm2_1_8b_sft_chat, 'Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft', @@ -1704,7 +1893,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str, TemplateType.internlm2, eos_token='<|im_end|>', support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='internlm/internlm2-chat-1_8b-sft') @register_model( ModelType.internlm2_1_8b_chat, 'Shanghai_AI_Laboratory/internlm2-chat-1_8b', @@ -1712,7 +1902,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str, TemplateType.internlm2, eos_token='<|im_end|>', support_flash_attn=True, - support_vllm=True) + 
support_vllm=True, + hf_model_id='internlm/internlm2-chat-1_8b') @register_model( ModelType.internlm2_math_7b, 'Shanghai_AI_Laboratory/internlm2-math-base-7b', @@ -1720,7 +1911,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str, TemplateType.default_generation_bos, support_flash_attn=True, support_vllm=True, - tags=['math']) + tags=['math'], + hf_model_id='internlm/internlm2-math-base-7b') @register_model( ModelType.internlm2_math_20b, 'Shanghai_AI_Laboratory/internlm2-math-base-20b', @@ -1728,7 +1920,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str, TemplateType.default_generation_bos, support_flash_attn=True, support_vllm=True, - tags=['math']) + tags=['math'], + hf_model_id='internlm/internlm2-math-base-20b') @register_model( ModelType.internlm2_math_7b_chat, 'Shanghai_AI_Laboratory/internlm2-math-7b', @@ -1737,7 +1930,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str, eos_token='<|im_end|>', support_flash_attn=True, support_vllm=True, - tags=['math']) + tags=['math'], + hf_model_id='internlm/internlm2-math-7b') @register_model( ModelType.internlm2_math_20b_chat, 'Shanghai_AI_Laboratory/internlm2-math-20b', @@ -1746,7 +1940,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str, eos_token='<|im_end|>', support_flash_attn=True, support_vllm=True, - tags=['math']) + tags=['math'], + hf_model_id='internlm/internlm2-math-20b') @register_model( ModelType.internlm2_7b_sft_chat, 'Shanghai_AI_Laboratory/internlm2-chat-7b-sft', @@ -1754,7 +1949,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str, TemplateType.internlm2, eos_token='<|im_end|>', support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='internlm/internlm2-chat-7b-sft') @register_model( ModelType.internlm2_7b_chat, 'Shanghai_AI_Laboratory/internlm2-chat-7b', @@ -1762,7 +1958,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str, TemplateType.internlm2, eos_token='<|im_end|>', support_flash_attn=True, - support_vllm=True) + support_vllm=True, 
+ hf_model_id='internlm/internlm2-chat-7b') @register_model( ModelType.internlm2_20b_sft_chat, 'Shanghai_AI_Laboratory/internlm2-chat-20b-sft', @@ -1770,7 +1967,8 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str, TemplateType.internlm2, eos_token='<|im_end|>', support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='internlm/internlm2-chat-20b-sft') @register_model( ModelType.internlm2_20b_chat, 'Shanghai_AI_Laboratory/internlm2-chat-20b', @@ -1778,35 +1976,40 @@ def get_model_tokenizer_qwen1half_intx(model_dir: str, TemplateType.internlm2, eos_token='<|im_end|>', support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='internlm/internlm2-chat-20b') @register_model( ModelType.internlm2_7b, 'Shanghai_AI_Laboratory/internlm2-7b', LoRATM.internlm2, TemplateType.default_generation_bos, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='internlm/internlm2-7b') @register_model( ModelType.internlm2_7b_base, 'Shanghai_AI_Laboratory/internlm2-base-7b', LoRATM.internlm2, TemplateType.default_generation_bos, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='internlm/internlm2-base-7b') @register_model( ModelType.internlm2_20b, 'Shanghai_AI_Laboratory/internlm2-20b', LoRATM.internlm2, TemplateType.default_generation_bos, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='internlm/internlm2-20b') @register_model( ModelType.internlm2_20b_base, 'Shanghai_AI_Laboratory/internlm2-base-20b', LoRATM.internlm2, TemplateType.default_generation_bos, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='internlm/internlm2-base-20b') def get_model_tokenizer_internlm2(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -1841,7 +2044,8 @@ def get_model_tokenizer_internlm2(model_dir: str, TemplateType.internlm_xcomposer2, eos_token='[UNUSED_TOKEN_145]', support_flash_attn=True, - 
tags=['multi-modal', 'vision']) + tags=['multi-modal', 'vision'], + hf_model_id='internlm/internlm-xcomposer2-7b') def get_model_tokenizer_internlm_xcomposer2(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -1951,14 +2155,16 @@ def _patch_deepseek_vl(model) -> None: LoRATM.llama2, TemplateType.deepseek_vl, support_flash_attn=True, - tags=['multi-modal', 'vision']) + tags=['multi-modal', 'vision'], + hf_model_id='deepseek-ai/deepseek-vl-7b-chat') @register_model( ModelType.deepseek_vl_1_3b_chat, 'deepseek-ai/deepseek-vl-1.3b-chat', LoRATM.llama2, TemplateType.deepseek_vl, support_flash_attn=True, - tags=['multi-modal', 'vision']) + tags=['multi-modal', 'vision'], + hf_model_id='deepseek-ai/deepseek-vl-1.3b-chat') def get_model_tokenizer_deepseek_vl(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2007,7 +2213,8 @@ def get_model_tokenizer_deepseek_vl(model_dir: str, TemplateType.default_generation_bos, ignore_file_pattern=[r'.+\.bin$'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='meta-llama/Llama-2-7b-hf') @register_model( ModelType.llama2_13b, 'modelscope/Llama-2-13b-ms', @@ -2015,7 +2222,8 @@ def get_model_tokenizer_deepseek_vl(model_dir: str, TemplateType.default_generation_bos, ignore_file_pattern=[r'.+\.bin$'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='meta-llama/Llama-2-13b-hf') @register_model( ModelType.llama2_70b, 'modelscope/Llama-2-70b-ms', @@ -2023,7 +2231,8 @@ def get_model_tokenizer_deepseek_vl(model_dir: str, TemplateType.default_generation_bos, ignore_file_pattern=[r'.+\.bin$'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='meta-llama/Llama-2-70b-hf') @register_model( ModelType.llama2_7b_chat, 'modelscope/Llama-2-7b-chat-ms', @@ -2031,7 +2240,8 @@ def get_model_tokenizer_deepseek_vl(model_dir: str, TemplateType.llama, ignore_file_pattern=[r'.+\.bin$'], support_flash_attn=True, - support_vllm=True) 
+ support_vllm=True, + hf_model_id='meta-llama/Llama-2-7b-chat-hf') @register_model( ModelType.llama2_13b_chat, 'modelscope/Llama-2-13b-chat-ms', @@ -2039,7 +2249,8 @@ def get_model_tokenizer_deepseek_vl(model_dir: str, TemplateType.llama, ignore_file_pattern=[r'.+\.bin$'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='meta-llama/Llama-2-13b-chat-hf') @register_model( ModelType.llama2_70b_chat, 'modelscope/Llama-2-70b-chat-ms', @@ -2047,7 +2258,8 @@ def get_model_tokenizer_deepseek_vl(model_dir: str, TemplateType.llama, ignore_file_pattern=[r'.+\.bin$'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='meta-llama/Llama-2-70b-chat-hf') def get_model_tokenizer_llama2(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2065,8 +2277,12 @@ def get_model_tokenizer_llama2(model_dir: str, **kwargs) -@register_model(ModelType.polylm_13b, 'damo/nlp_polylm_13b_text_generation', - LoRATM.polylm, TemplateType.default_generation) +@register_model( + ModelType.polylm_13b, + 'damo/nlp_polylm_13b_text_generation', + LoRATM.polylm, + TemplateType.default_generation, + hf_model_id='DAMO-NLP-MT/polylm-13b') def get_model_tokenizer_polylm(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2139,21 +2355,24 @@ def get_model_tokenizer_qwen(model_dir: str, TemplateType.codefuse, support_flash_attn=True, support_vllm=True, - tags=['coding']) + tags=['coding'], + hf_model_id='codefuse-ai/CodeFuse-QWen-14B') @register_model( ModelType.qwen_1_8b, 'qwen/Qwen-1_8B', LoRATM.qwen, TemplateType.default_generation, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen1.5-1.8B') @register_model( ModelType.qwen_72b, 'qwen/Qwen-72B', LoRATM.qwen, TemplateType.default_generation, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen-72B') @register_model( ModelType.tongyi_finance_14b, 'TongyiFinance/Tongyi-Finance-14B', @@ 
-2168,14 +2387,16 @@ def get_model_tokenizer_qwen(model_dir: str, LoRATM.qwen, TemplateType.default_generation, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen-14B') @register_model( ModelType.qwen_7b, 'qwen/Qwen-7B', LoRATM.qwen, TemplateType.default_generation, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen-7B') def get_model_tokenizer_qwen_base(*args, **kwargs): model, tokenizer = get_model_tokenizer_qwen(*args, **kwargs) tokenizer.eos_token_id = tokenizer.eod_id @@ -2188,14 +2409,16 @@ def get_model_tokenizer_qwen_base(*args, **kwargs): LoRATM.qwen, TemplateType.qwen, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen-1_8B-Chat') @register_model( ModelType.qwen_72b_chat, 'qwen/Qwen-72B-Chat', LoRATM.qwen, TemplateType.qwen, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen-72B-Chat') @register_model( ModelType.tongyi_finance_14b_chat, 'TongyiFinance/Tongyi-Finance-14B-Chat', @@ -2203,21 +2426,24 @@ def get_model_tokenizer_qwen_base(*args, **kwargs): TemplateType.qwen, support_flash_attn=True, support_vllm=True, - tags=['financial']) + tags=['financial'], + hf_model_id='jxy/Tongyi-Finance-14B-Chat') @register_model( ModelType.qwen_14b_chat, 'qwen/Qwen-14B-Chat', LoRATM.qwen, TemplateType.qwen, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen-14B-Chat') @register_model( ModelType.qwen_7b_chat, 'qwen/Qwen-7B-Chat', LoRATM.qwen, TemplateType.qwen, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen-7B-Chat') def get_model_tokenizer_qwen_chat(*args, **kwargs): model, tokenizer = get_model_tokenizer_qwen(*args, **kwargs) tokenizer.eos_token_id = tokenizer.im_end_id @@ -2279,7 +2505,8 @@ def _qwen_vl_audio_decode(self, LoRATM.qwen, TemplateType.qwen, support_flash_attn=True, - tags=['multi-modal', 'vision']) + 
tags=['multi-modal', 'vision'], + hf_model_id='Qwen/Qwen-VL-Chat') @register_model( ModelType.qwen_vl, 'qwen/Qwen-VL', @@ -2287,7 +2514,8 @@ def _qwen_vl_audio_decode(self, TemplateType.default_generation, function_kwargs={'get_qwen_function': get_model_tokenizer_qwen_base}, support_flash_attn=True, - tags=['multi-modal', 'vision']) + tags=['multi-modal', 'vision'], + hf_model_id='Qwen/Qwen-VL') def get_model_tokenizer_qwen_vl(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2341,7 +2569,8 @@ def get_model_tokenizer_qwen_vl(model_dir: str, TemplateType.qwen_audio, support_flash_attn=True, function_kwargs={'get_qwen_function': get_model_tokenizer_qwen_chat}, - tags=['multi-modal', 'audio']) + tags=['multi-modal', 'audio'], + hf_model_id='Qwen/Qwen-Audio-Chat') @register_model( ModelType.qwen_audio, 'qwen/Qwen-Audio', @@ -2349,7 +2578,8 @@ def get_model_tokenizer_qwen_vl(model_dir: str, TemplateType.qwen_audio_generation, support_flash_attn=True, function_kwargs={'get_qwen_function': get_model_tokenizer_qwen_base}, - tags=['multi-modal', 'audio']) + tags=['multi-modal', 'audio'], + hf_model_id='Qwen/Qwen-Audio') def get_model_tokenizer_qwen_audio(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2382,7 +2612,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, function_kwargs={'bits': 8}, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Qwen/Qwen-1_8B-Chat-Int8') @register_model( ModelType.qwen_1_8b_chat_int4, 'qwen/Qwen-1_8B-Chat-Int4', @@ -2392,7 +2623,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str, torch_dtype=torch.float16, function_kwargs={'bits': 4}, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen-1_8B-Chat-Int4') @register_model( ModelType.qwen_72b_chat_int8, 'qwen/Qwen-72B-Chat-Int8', @@ -2401,7 +2633,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str, requires=['auto_gptq>=0.5'], 
torch_dtype=torch.float16, function_kwargs={'bits': 8}, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Qwen/Qwen-72B-Chat-Int8') @register_model( ModelType.qwen_72b_chat_int4, 'qwen/Qwen-72B-Chat-Int4', @@ -2411,7 +2644,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str, torch_dtype=torch.float16, function_kwargs={'bits': 4}, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen-72B-Chat-Int4') @register_model( ModelType.tongyi_finance_14b_chat_int4, 'TongyiFinance/Tongyi-Finance-14B-Chat-Int4', @@ -2422,7 +2656,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str, function_kwargs={'bits': 4}, support_flash_attn=True, support_vllm=True, - tags=['financial']) + tags=['financial'], + hf_model_id='jxy/Tongyi-Finance-14B-Chat-Int4') @register_model( ModelType.qwen_vl_chat_int4, 'qwen/Qwen-VL-Chat-Int4', @@ -2435,7 +2670,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str, 'bits': 4 }, support_flash_attn=True, - tags=['multi-modal', 'vision']) + tags=['multi-modal', 'vision'], + hf_model_id='Qwen/Qwen-VL-Chat-Int4') @register_model( ModelType.qwen_14b_chat_int8, 'qwen/Qwen-14B-Chat-Int8', @@ -2444,7 +2680,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, function_kwargs={'bits': 8}, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Qwen/Qwen-14B-Chat-Int8') @register_model( ModelType.qwen_7b_chat_int8, 'qwen/Qwen-7B-Chat-Int8', @@ -2453,7 +2690,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, function_kwargs={'bits': 8}, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Qwen/Qwen-7B-Chat-Int8') @register_model( ModelType.qwen_14b_chat_int4, 'qwen/Qwen-14B-Chat-Int4', @@ -2463,7 +2701,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str, torch_dtype=torch.float16, function_kwargs={'bits': 4}, support_flash_attn=True, - support_vllm=True) + 
support_vllm=True, + hf_model_id='Qwen/Qwen-14B-Chat-Int4') @register_model( ModelType.qwen_7b_chat_int4, 'qwen/Qwen-7B-Chat-Int4', @@ -2473,7 +2712,8 @@ def get_model_tokenizer_qwen_audio(model_dir: str, torch_dtype=torch.float16, function_kwargs={'bits': 4}, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='Qwen/Qwen-7B-Chat-Int4') def get_model_tokenizer_qwen_intx(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2511,9 +2751,13 @@ def _new_forward(self, x): return model, tokenizer -register_model(ModelType.skywork_13b, 'skywork/Skywork-13B-base', - LoRATM.llama2, TemplateType.default_generation_bos, - get_model_tokenizer_from_repo) +register_model( + ModelType.skywork_13b, + 'skywork/Skywork-13B-base', + LoRATM.llama2, + TemplateType.default_generation_bos, + get_model_tokenizer_from_repo, + hf_model_id='Skywork/Skywork-13B-base') @register_model(ModelType.skywork_13b_chat, 'skywork/Skywork-13B-chat', @@ -2539,7 +2783,8 @@ def get_skywork_model_tokenizer(model_dir: str, TemplateType.codefuse_codellama, support_flash_attn=True, support_vllm=True, - tags=['coding']) + tags=['coding'], + hf_model_id='codefuse-ai/CodeFuse-CodeLlama-34B') def get_model_tokenizer_codellama(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2564,13 +2809,15 @@ def get_model_tokenizer_codellama(model_dir: str, support_flash_attn=True, support_vllm=True, support_gradient_checkpointing=False, - tags=['coding']) + tags=['coding'], + hf_model_id='microsoft/phi-2') @register_model( ModelType.telechat_12b, 'TeleAI/TeleChat-12B', LoRATM.telechat, TemplateType.telechat, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Tele-AI/TeleChat-12B') def get_model_tokenizer_phi(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2594,7 +2841,8 @@ def get_model_tokenizer_phi(model_dir: str, 'TeleAI/TeleChat-7B', LoRATM.telechat, TemplateType.telechat, - support_flash_attn=True) + 
support_flash_attn=True, + hf_model_id='Tele-AI/telechat-7B') def get_model_tokenizer_telechat(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2624,14 +2872,16 @@ def get_model_tokenizer_telechat(model_dir: str, LoRATM.llama2, TemplateType.deepseek, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='deepseek-ai/deepseek-moe-16b-chat') @register_model( ModelType.deepseek_moe_16b, 'deepseek-ai/deepseek-moe-16b-base', LoRATM.llama2, TemplateType.default_generation_bos, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='deepseek-ai/deepseek-moe-16b-base') @register_model( ModelType.minicpm_moe_8x2b, 'OpenBMB/MiniCPM-MoE-8x2B', @@ -2639,7 +2889,8 @@ def get_model_tokenizer_telechat(model_dir: str, TemplateType.minicpm, requires=['transformers>=4.36.0'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='openbmb/MiniCPM-MoE-8x2B') def get_model_tokenizer_deepseek_moe(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2677,25 +2928,29 @@ def _new_forward(hidden_states, *, 'YuanLLM/Yuan2.0-2B-hf', LoRATM.llama2, TemplateType.yuan, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='IEITYuan/Yuan2-2B-hf') @register_model( ModelType.yuan2_51b_instruct, 'YuanLLM/Yuan2.0-51B-hf', LoRATM.llama2, TemplateType.yuan, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='IEITYuan/Yuan2-51B-hf') @register_model( ModelType.yuan2_102b_instruct, 'YuanLLM/Yuan2.0-102B-hf', LoRATM.llama2, TemplateType.yuan, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='IEITYuan/Yuan2-102B-hf') @register_model( ModelType.yuan2_2b_janus_instruct, 'YuanLLM/Yuan2-2B-Janus-hf', LoRATM.llama2, TemplateType.yuan, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='IEITYuan/Yuan2-2B-Janus-hf') def get_model_tokenizer_yuan(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2743,13 +2998,15 @@ 
def get_model_tokenizer_yuan(model_dir: str, 'OrionStarAI/Orion-14B-Base', LoRATM.llama2, TemplateType.default_generation, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='OrionStarAI/Orion-14B-Base') @register_model( ModelType.orion_14b_chat, 'OrionStarAI/Orion-14B-Chat', LoRATM.llama2, TemplateType.orion, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='OrionStarAI/Orion-14B-Chat') def get_model_tokenizer_orion(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2774,7 +3031,8 @@ def get_model_tokenizer_orion(model_dir: str, TemplateType.yi_vl, support_flash_attn=True, requires=['transformers>=4.34'], - tags=['multi-modal', 'vision']) + tags=['multi-modal', 'vision'], + hf_model_id='01-ai/Yi-VL-34B') @register_model( ModelType.yi_vl_6b_chat, '01ai/Yi-VL-6B', @@ -2782,7 +3040,8 @@ def get_model_tokenizer_orion(model_dir: str, TemplateType.yi_vl, support_flash_attn=True, requires=['transformers>=4.34'], - tags=['multi-modal', 'vision']) + tags=['multi-modal', 'vision'], + hf_model_id='01-ai/Yi-VL-6B') def get_model_tokenizer_yi_vl(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2825,14 +3084,16 @@ def get_model_tokenizer_yi_vl(model_dir: str, LoRATM.llama2, TemplateType.minicpm, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='openbmb/MiniCPM-2B-sft-fp32') @register_model( ModelType.minicpm_2b_chat, 'OpenBMB/MiniCPM-2B-dpo-fp32', LoRATM.llama2, TemplateType.minicpm, support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='openbmb/MiniCPM-2B-dpo-fp32') @register_model( ModelType.minicpm_1b_sft_chat, 'OpenBMB/MiniCPM-1B-sft-bf16', @@ -2840,7 +3101,8 @@ def get_model_tokenizer_yi_vl(model_dir: str, TemplateType.minicpm, requires=['transformers>=4.36.0'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='openbmb/MiniCPM-1B-sft-bf16') @register_model( ModelType.minicpm_2b_128k, 
'OpenBMB/MiniCPM-2B-128k', @@ -2848,7 +3110,8 @@ def get_model_tokenizer_yi_vl(model_dir: str, TemplateType.chatml, requires=['transformers>=4.36.0'], support_flash_attn=True, - support_vllm=True) + support_vllm=True, + hf_model_id='openbmb/MiniCPM-2B-128k') def get_model_tokenizer_minicpm(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2873,13 +3136,15 @@ def get_model_tokenizer_minicpm(model_dir: str, 'OpenBMB/MiniCPM-V', LoRATM.llama2, TemplateType.minicpm_v, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='openbmb/MiniCPM-V') @register_model( ModelType.minicpm_v_v2, 'OpenBMB/MiniCPM-V-2', LoRATM.llama2, TemplateType.minicpm_v, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='openbmb/MiniCPM-V-2') def get_model_tokenizer_minicpm_v(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2911,6 +3176,16 @@ def _new_generate(inputs=None, *args, **kwargs): model.generate = _new_generate +@register_model( + ModelType.llava1d6_yi_34b_instruct, + 'AI-ModelScope/llava-v1.6-34b', + LoRATM.llama2, + TemplateType.llava_yi_instruct, + eos_token='<|im_end|>', + support_flash_attn=True, + function_kwargs={'llm_model_type': 'llama'}, + tags=['multi-modal', 'vision'], + hf_model_id='liuhaotian/llava-v1.6-34b') @register_model( ModelType.llava1d6_mistral_7b_instruct, 'AI-ModelScope/llava-v1.6-mistral-7b', @@ -2918,7 +3193,9 @@ def _new_generate(inputs=None, *args, **kwargs): TemplateType.llava_mistral_instruct, requires=['transformers>=4.34'], support_flash_attn=True, - tags=['multi-modal', 'vision']) + function_kwargs={'llm_model_type': 'mistral'}, + tags=['multi-modal', 'vision'], + hf_model_id='liuhaotian/llava-v1.6-mistral-7b') def get_model_tokenizer_llava(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2928,8 +3205,26 @@ def get_model_tokenizer_llava(model_dir: str, 'https://github.com/haotian-liu/LLaVA.git') sys.path.append(os.path.join(local_repo_path)) - from llava.model 
import LlavaMistralForCausalLM, LlavaMistralConfig - model_config = LlavaMistralConfig.from_pretrained(model_dir) + llm_model_type = kwargs.pop('llm_model_type') + if llm_model_type == 'mistral': + from llava.model import LlavaMistralForCausalLM, LlavaMistralConfig + model_config = LlavaMistralConfig.from_pretrained(model_dir) + automodel_class = LlavaMistralForCausalLM + else: # llama + from llava.model import LlavaLlamaForCausalLM, LlavaConfig + if not hasattr(LlavaLlamaForCausalLM, + '__old_forward'): # Avoid double patching + forward = LlavaLlamaForCausalLM.forward + LlavaLlamaForCausalLM.__old_forward = forward + + @wraps(forward) + def _new_forward(*args, **kwargs): + kwargs.pop('cache_position', None) + return forward(*args, **kwargs) + + LlavaLlamaForCausalLM.forward = _new_forward + model_config = LlavaConfig.from_pretrained(model_dir) + automodel_class = LlavaLlamaForCausalLM model_config.mm_vision_tower = snapshot_download( 'AI-ModelScope/clip-vit-large-patch14-336') model, tokenizer = get_model_tokenizer_with_flash_attn( @@ -2938,7 +3233,7 @@ def get_model_tokenizer_llava(model_dir: str, model_kwargs, load_model, model_config=model_config, - automodel_class=LlavaMistralForCausalLM, + automodel_class=automodel_class, **kwargs) model.resize_token_embeddings(len(tokenizer)) @@ -2952,55 +3247,6 @@ def get_model_tokenizer_llava(model_dir: str, return model, tokenizer -@register_model( - ModelType.llava1d6_yi_34b_instruct, - 'AI-ModelScope/llava-v1.6-34b', - LoRATM.llama2, - TemplateType.llava_yi_instruct, - eos_token='<|im_end|>', - support_flash_attn=True, - tags=['multi-modal', 'vision']) -def get_model_tokenizer_llava_34b(model_dir: str, - torch_dtype: Dtype, - model_kwargs: Dict[str, Any], - load_model: bool = True, - **kwargs): - local_repo_path = _git_clone_github( - 'https://github.com/haotian-liu/LLaVA.git') - sys.path.append(os.path.join(local_repo_path)) - - from llava.model import LlavaLlamaForCausalLM, LlavaConfig - forward = 
LlavaLlamaForCausalLM.forward - LlavaLlamaForCausalLM.__old_forward = forward - - @wraps(forward) - def _new_forward(*args, **kwargs): - kwargs.pop('cache_position', None) - return forward(*args, **kwargs) - - LlavaLlamaForCausalLM.forward = _new_forward - model_config = LlavaConfig.from_pretrained(model_dir) - model_config.mm_vision_tower = snapshot_download( - 'AI-ModelScope/clip-vit-large-patch14-336') - model, tokenizer = get_model_tokenizer_with_flash_attn( - model_dir, - torch_dtype, - model_kwargs, - load_model, - model_config=model_config, - automodel_class=LlavaLlamaForCausalLM, - **kwargs) - model.resize_token_embeddings(len(tokenizer)) - vision_tower = model.get_vision_tower() - device_map = str(model_kwargs.get('device_map', str(model.device))) - if not vision_tower.is_loaded: - vision_tower.load_model(device_map=device_map) - if not hasattr(model.config, 'max_sequence_length'): - model.config.max_sequence_length = 2048 - _patch_llava(model) - return model, tokenizer - - @register_model( ModelType.mplug_owl2_chat, 'iic/mPLUG-Owl2', @@ -3011,7 +3257,8 @@ def _new_forward(*args, **kwargs): function_kwargs={ 'get_model_tokenizer_function': get_model_tokenizer_with_flash_attn }, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='MAGAer13/mplug-owl2-llama2-7b') @register_model( ModelType.mplug_owl2d1_chat, 'iic/mPLUG-Owl2.1', @@ -3023,7 +3270,8 @@ def _new_forward(*args, **kwargs): 'vocab_size': 151851, 'get_model_tokenizer_function': get_model_tokenizer_qwen }, - support_flash_attn=True) + support_flash_attn=True, + hf_model_id='Mizukiluke/mplug_owl_2_1') def get_model_tokenizer_mplug_owl2(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -3109,7 +3357,7 @@ def safe_snapshot_download(model_type: str, dist.barrier() if model_id_or_path is not None and not os.path.exists(model_id_or_path): revision = model_info['revision'] - use_hf = model_info['use_hf'] + use_hf = strtobool(os.environ.get('USE_HF', 'False')) 
ignore_file_pattern = model_info['ignore_file_pattern'] if use_hf: logger.info( From dcf38555d98a0f7df3fd78fa0f311055349a4ab6 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 16 Apr 2024 17:20:55 +0800 Subject: [PATCH 03/26] fix qwen-vl bnb bug --- swift/llm/utils/model.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 2cf09e95c9..6d4c86842b 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -2527,6 +2527,15 @@ def get_model_tokenizer_qwen_vl(model_dir: str, model_kwargs['quantization_config'].llm_int8_skip_modules = [ 'lm_head', 'attn_pool.attn' ] + _TransformerBlock = get_class_from_dynamic_module( + 'visual.TransformerBlock', model_dir) + + def _get_cast_dtype(self) -> torch.dtype: + return self.resblocks[0].ln_1.weight.dtype + + _TransformerBlock.__old_get_cast_dtype = _TransformerBlock.get_cast_dtype + _TransformerBlock.get_cast_dtype = _get_cast_dtype + get_qwen_function = kwargs.pop('get_qwen_function', get_model_tokenizer_qwen_chat) tokenizer_config = get_tokenizer_config(model_dir) From 693082f2aba98ce558cc423db98e9e1dadefd0e7 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 16 Apr 2024 17:51:21 +0800 Subject: [PATCH 04/26] fix gptq quant bug --- swift/llm/export.py | 2 +- swift/llm/infer.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/swift/llm/export.py b/swift/llm/export.py index 09a701cdd3..0fe892b8e7 100644 --- a/swift/llm/export.py +++ b/swift/llm/export.py @@ -145,7 +145,7 @@ def gptq_model_quantize(model, tokenizer): global _args logger.info(f'Quantization dataset: {_args.dataset}') gptq_quantizer = GPTQQuantizer( - bits=_args.quant_bits, dataset=_args.dataset) + bits=_args.quant_bits, dataset=','.join(_args.dataset)) _origin_get_dataset = quantizer.get_dataset quantizer.get_dataset = _get_dataset logger.info('Start quantizing the model...') diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 
88d178cc3d..b6c86e91f7 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -107,10 +107,13 @@ def merge_lora(args: InferArguments, model_kwargs = {} if is_torch_npu_available(): logger.info(f'device_count: {torch.npu.device_count()}') - device_map = 'npu:0' + if device_map is None: + device_map = 'npu:0' else: logger.info(f'device_count: {torch.cuda.device_count()}') - device_map = 'auto' + if device_map is None: + device_map = 'auto' + if device_map == 'auto': model_kwargs['low_cpu_mem_usage'] = True model_kwargs['device_map'] = device_map @@ -150,10 +153,13 @@ def prepare_model_template( model_kwargs = {} if is_torch_npu_available(): logger.info(f'device_count: {torch.npu.device_count()}') - device_map = 'npu:0' + if device_map is None: + device_map = 'npu:0' else: logger.info(f'device_count: {torch.cuda.device_count()}') - device_map = 'auto' + if device_map is None: + device_map = 'auto' + if device_map == 'auto': model_kwargs['low_cpu_mem_usage'] = True model_kwargs['device_map'] = device_map From 96772ad76187ea8a4fe4710e6064b50bb9227bda Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 16 Apr 2024 18:36:00 +0800 Subject: [PATCH 05/26] fix gptq quant bug --- docs/source/Multi-Modal/index.md | 12 +-- swift/llm/dpo.py | 3 +- swift/llm/export.py | 2 +- swift/llm/infer.py | 14 +++- swift/llm/utils/model.py | 123 +++++++++++++++---------------- 5 files changed, 79 insertions(+), 75 deletions(-) diff --git a/docs/source/Multi-Modal/index.md b/docs/source/Multi-Modal/index.md index b25b49e3e1..50475ace57 100644 --- a/docs/source/Multi-Modal/index.md +++ b/docs/source/Multi-Modal/index.md @@ -5,9 +5,9 @@ 1. [Qwen-VL最佳实践](qwen-vl最佳实践.md) 2. [Qwen-Audio最佳实践](qwen-audio最佳实践.md) 3. [Llava最佳实践](llava最佳实践.md) -4. [Deepseek-VL最佳实践](../Multi-Modal/deepseek-vl最佳实践.md) -5. [Yi-VL最佳实践.md](../Multi-Modal/yi-vl最佳实践.md) -6. [Internlm2-Xcomposers最佳实践](../Multi-Modal/internlm-xcomposer2最佳实践.md) -7. 
[MiniCPM-V最佳实践](../Multi-Modal/minicpm-v最佳实践.md), [MiniCPM-V-2最佳实践](../Multi-Modal/minicpm-v-2最佳实践.md) -8. [CogVLM最佳实践](../Multi-Modal/cogvlm最佳实践.md) -9. [mPLUG-Owl2最佳实践](../Multi-Modal/mplug-owl2最佳实践.md) +4. [Deepseek-VL最佳实践](deepseek-vl最佳实践.md) +5. [Yi-VL最佳实践.md](yi-vl最佳实践.md) +6. [Internlm2-Xcomposers最佳实践](internlm-xcomposer2最佳实践.md) +7. [MiniCPM-V最佳实践](minicpm-v最佳实践.md), [MiniCPM-V-2最佳实践](minicpm-v-2最佳实践.md) +8. [CogVLM最佳实践](cogvlm最佳实践.md) +9. [mPLUG-Owl2最佳实践](mplug-owl2最佳实践.md) diff --git a/swift/llm/dpo.py b/swift/llm/dpo.py index a43a46a724..58e6de18cc 100644 --- a/swift/llm/dpo.py +++ b/swift/llm/dpo.py @@ -12,10 +12,9 @@ from swift.utils import (check_json_format, get_dist_setting, get_logger, get_main, get_model_info, is_ddp_plus_mp, is_dist, is_master, plot_images, seed_everything, show_layers) -from . import get_time_info from .tuner import prepare_model from .utils import (DPOArguments, Template, get_dataset, get_model_tokenizer, - get_template, set_generation_config) + get_template, get_time_info, set_generation_config) logger = get_logger() diff --git a/swift/llm/export.py b/swift/llm/export.py index 09a701cdd3..0fe892b8e7 100644 --- a/swift/llm/export.py +++ b/swift/llm/export.py @@ -145,7 +145,7 @@ def gptq_model_quantize(model, tokenizer): global _args logger.info(f'Quantization dataset: {_args.dataset}') gptq_quantizer = GPTQQuantizer( - bits=_args.quant_bits, dataset=_args.dataset) + bits=_args.quant_bits, dataset=','.join(_args.dataset)) _origin_get_dataset = quantizer.get_dataset quantizer.get_dataset = _get_dataset logger.info('Start quantizing the model...') diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 88d178cc3d..b6c86e91f7 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -107,10 +107,13 @@ def merge_lora(args: InferArguments, model_kwargs = {} if is_torch_npu_available(): logger.info(f'device_count: {torch.npu.device_count()}') - device_map = 'npu:0' + if device_map is None: + device_map = 'npu:0' else: 
logger.info(f'device_count: {torch.cuda.device_count()}') - device_map = 'auto' + if device_map is None: + device_map = 'auto' + if device_map == 'auto': model_kwargs['low_cpu_mem_usage'] = True model_kwargs['device_map'] = device_map @@ -150,10 +153,13 @@ def prepare_model_template( model_kwargs = {} if is_torch_npu_available(): logger.info(f'device_count: {torch.npu.device_count()}') - device_map = 'npu:0' + if device_map is None: + device_map = 'npu:0' else: logger.info(f'device_count: {torch.cuda.device_count()}') - device_map = 'auto' + if device_map is None: + device_map = 'auto' + if device_map == 'auto': model_kwargs['low_cpu_mem_usage'] = True model_kwargs['device_map'] = device_map diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 0b5c2cb9a4..bceba5a4d9 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -1214,6 +1214,13 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation, support_flash_attn=True, support_vllm=True) +@register_model( + ModelType.yi_9b_200k, + '01ai/Yi-9B-200K', + LoRATM.llama2, + TemplateType.default_generation, + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.yi_6b, '01ai/Yi-6B', @@ -1452,6 +1459,15 @@ def get_model_tokenizer_aqlm(model_dir: str, support_vllm=True, function_kwargs={'is_awq': True}, requires=['transformers>=4.37', 'autoawq']) +@register_model( + ModelType.qwen1half_32b_chat_awq, + 'qwen/Qwen1.5-32B-Chat-AWQ', + LoRATM.qwen1half, + TemplateType.qwen, + support_flash_attn=True, + support_vllm=True, + function_kwargs={'is_awq': True}, + requires=['transformers>=4.37', 'autoawq']) @register_model( ModelType.qwen1half_72b_chat_awq, 'qwen/Qwen1.5-72B-Chat-AWQ', @@ -2299,6 +2315,15 @@ def get_model_tokenizer_qwen_vl(model_dir: str, model_kwargs['quantization_config'].llm_int8_skip_modules = [ 'lm_head', 'attn_pool.attn' ] + _TransformerBlock = get_class_from_dynamic_module( + 'visual.TransformerBlock', model_dir) + + def 
_get_cast_dtype(self) -> torch.dtype: + return self.resblocks[0].ln_1.weight.dtype + + _TransformerBlock.__old_get_cast_dtype = _TransformerBlock.get_cast_dtype + _TransformerBlock.get_cast_dtype = _get_cast_dtype + get_qwen_function = kwargs.pop('get_qwen_function', get_model_tokenizer_qwen_chat) tokenizer_config = get_tokenizer_config(model_dir) @@ -2911,6 +2936,16 @@ def _new_generate(inputs=None, *args, **kwargs): model.generate = _new_generate +@register_model( + ModelType.llava1d6_yi_34b_instruct, + 'AI-ModelScope/llava-v1.6-34b', + LoRATM.llama2, + TemplateType.llava_yi_instruct, + eos_token='<|im_end|>', + support_flash_attn=True, + function_kwargs={'llm_model_type': 'llama'}, + tags=['multi-modal', 'vision'], + hf_model_id='liuhaotian/llava-v1.6-34b') @register_model( ModelType.llava1d6_mistral_7b_instruct, 'AI-ModelScope/llava-v1.6-mistral-7b', @@ -2918,7 +2953,9 @@ def _new_generate(inputs=None, *args, **kwargs): TemplateType.llava_mistral_instruct, requires=['transformers>=4.34'], support_flash_attn=True, - tags=['multi-modal', 'vision']) + function_kwargs={'llm_model_type': 'mistral'}, + tags=['multi-modal', 'vision'], + hf_model_id='liuhaotian/llava-v1.6-mistral-7b') def get_model_tokenizer_llava(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -2928,76 +2965,38 @@ def get_model_tokenizer_llava(model_dir: str, 'https://github.com/haotian-liu/LLaVA.git') sys.path.append(os.path.join(local_repo_path)) - from llava.model import LlavaMistralForCausalLM, LlavaMistralConfig - model_config = LlavaMistralConfig.from_pretrained(model_dir) + llm_model_type = kwargs.pop('llm_model_type') + if llm_model_type == 'mistral': + from llava.model import LlavaMistralForCausalLM, LlavaMistralConfig + model_config = LlavaMistralConfig.from_pretrained(model_dir) + automodel_class = LlavaMistralForCausalLM + else: # llama + from llava.model import LlavaLlamaForCausalLM, LlavaConfig + if not hasattr(LlavaLlamaForCausalLM, + '__old_forward'): # Avoid 
double patching + forward = LlavaLlamaForCausalLM.forward + LlavaLlamaForCausalLM.__old_forward = forward + + @wraps(forward) + def _new_forward(*args, **kwargs): + kwargs.pop('cache_position', None) + return forward(*args, **kwargs) + + LlavaLlamaForCausalLM.forward = _new_forward + model_config = LlavaConfig.from_pretrained(model_dir) + automodel_class = LlavaLlamaForCausalLM model_config.mm_vision_tower = snapshot_download( 'AI-ModelScope/clip-vit-large-patch14-336') model, tokenizer = get_model_tokenizer_with_flash_attn( - model_dir, - torch_dtype, + @@ -2938,7 +3242,7 @@ def get_model_tokenizer_llava(model_dir: str, model_kwargs, load_model, model_config=model_config, - automodel_class=LlavaMistralForCausalLM, + automodel_class=automodel_class, **kwargs) model.resize_token_embeddings(len(tokenizer)) - vision_tower = model.get_vision_tower() - device_map = str(model_kwargs.get('device_map', str(model.device))) - if not vision_tower.is_loaded: - vision_tower.load_model(device_map=device_map) - if not hasattr(model.config, 'max_sequence_length'): - model.config.max_sequence_length = 2048 - _patch_llava(model) - return model, tokenizer - - -@register_model( - ModelType.llava1d6_yi_34b_instruct, - 'AI-ModelScope/llava-v1.6-34b', - LoRATM.llama2, - TemplateType.llava_yi_instruct, - eos_token='<|im_end|>', - support_flash_attn=True, - tags=['multi-modal', 'vision']) -def get_model_tokenizer_llava_34b(model_dir: str, - torch_dtype: Dtype, - model_kwargs: Dict[str, Any], - load_model: bool = True, - **kwargs): - local_repo_path = _git_clone_github( - 'https://github.com/haotian-liu/LLaVA.git') - sys.path.append(os.path.join(local_repo_path)) - - from llava.model import LlavaLlamaForCausalLM, LlavaConfig - forward = LlavaLlamaForCausalLM.forward - LlavaLlamaForCausalLM.__old_forward = forward - - @wraps(forward) - def _new_forward(*args, **kwargs): - kwargs.pop('cache_position', None) - return forward(*args, **kwargs) - - LlavaLlamaForCausalLM.forward = _new_forward - 
model_config = LlavaConfig.from_pretrained(model_dir) - model_config.mm_vision_tower = snapshot_download( - 'AI-ModelScope/clip-vit-large-patch14-336') - model, tokenizer = get_model_tokenizer_with_flash_attn( - model_dir, - torch_dtype, - model_kwargs, - load_model, - model_config=model_config, - automodel_class=LlavaLlamaForCausalLM, - **kwargs) - model.resize_token_embeddings(len(tokenizer)) - vision_tower = model.get_vision_tower() - device_map = str(model_kwargs.get('device_map', str(model.device))) - if not vision_tower.is_loaded: - vision_tower.load_model(device_map=device_map) - if not hasattr(model.config, 'max_sequence_length'): - model.config.max_sequence_length = 2048 - _patch_llava(model) + @@ -2952,55 +3256,6 @@ def get_model_tokenizer_llava(model_dir: str, return model, tokenizer From 48f587c199aec37303f84753a9758be7c7536b5f Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 16 Apr 2024 18:42:50 +0800 Subject: [PATCH 06/26] fix bug --- swift/llm/utils/model.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index bceba5a4d9..9d696bb558 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -2988,7 +2988,8 @@ def _new_forward(*args, **kwargs): model_config.mm_vision_tower = snapshot_download( 'AI-ModelScope/clip-vit-large-patch14-336') model, tokenizer = get_model_tokenizer_with_flash_attn( - @@ -2938,7 +3242,7 @@ def get_model_tokenizer_llava(model_dir: str, + model_dir, + torch_dtype, model_kwargs, load_model, model_config=model_config, @@ -2996,7 +2997,13 @@ def _new_forward(*args, **kwargs): **kwargs) model.resize_token_embeddings(len(tokenizer)) - @@ -2952,55 +3256,6 @@ def get_model_tokenizer_llava(model_dir: str, + vision_tower = model.get_vision_tower() + device_map = str(model_kwargs.get('device_map', str(model.device))) + if not vision_tower.is_loaded: + vision_tower.load_model(device_map=device_map) + if not hasattr(model.config, 
'max_sequence_length'): + model.config.max_sequence_length = 2048 + _patch_llava(model) return model, tokenizer From 4a68e761cab7ca2d282e055f2448fa0398e23ecf Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 16 Apr 2024 18:48:26 +0800 Subject: [PATCH 07/26] fix bug --- swift/llm/utils/model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 9d696bb558..805586b6dc 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -95,6 +95,7 @@ class ModelType: qwen1half_4b_chat_awq = 'qwen1half-4b-chat-awq' qwen1half_7b_chat_awq = 'qwen1half-7b-chat-awq' qwen1half_14b_chat_awq = 'qwen1half-14b-chat-awq' + qwen1half_32b_chat_awq = 'qwen1half-32b-chat-awq' qwen1half_72b_chat_awq = 'qwen1half-72b-chat-awq' # qwen-vl @@ -127,6 +128,7 @@ class ModelType: yi_6b_200k = 'yi-6b-200k' yi_6b_chat = 'yi-6b-chat' yi_9b = 'yi-9b' + yi_9b_200k = 'yi-9b-200k' yi_34b = 'yi-34b' yi_34b_200k = 'yi-34b-200k' yi_34b_chat = 'yi-34b-chat' From f51d954f05aec40aa6828046254e676f98db58f6 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 16 Apr 2024 18:50:42 +0800 Subject: [PATCH 08/26] update --- swift/llm/utils/model.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index bd48f18b24..6d4c86842b 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -1343,13 +1343,6 @@ def cross_entropy_forward(self, inputs: Tensor, support_flash_attn=True, support_vllm=True, hf_model_id='01-ai/Yi-9B-200K') -@register_model( - ModelType.yi_9b_200k, - '01ai/Yi-9B-200K', - LoRATM.llama2, - TemplateType.default_generation, - support_flash_attn=True, - support_vllm=True) @register_model( ModelType.yi_6b, '01ai/Yi-6B', @@ -1624,15 +1617,6 @@ def get_model_tokenizer_aqlm(model_dir: str, function_kwargs={'is_awq': True}, requires=['transformers>=4.37', 'autoawq'], hf_model_id='Qwen/Qwen1.5-32B-Chat-AWQ') -@register_model( - 
ModelType.qwen1half_32b_chat_awq, - 'qwen/Qwen1.5-32B-Chat-AWQ', - LoRATM.qwen1half, - TemplateType.qwen, - support_flash_attn=True, - support_vllm=True, - function_kwargs={'is_awq': True}, - requires=['transformers>=4.37', 'autoawq']) @register_model( ModelType.qwen1half_72b_chat_awq, 'qwen/Qwen1.5-72B-Chat-AWQ', From 4fc4aae270a6cc792320c1ae6edb44fdc766171d Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 16 Apr 2024 19:00:26 +0800 Subject: [PATCH 09/26] fix --- swift/llm/utils/model.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index bd48f18b24..6d4c86842b 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -1343,13 +1343,6 @@ def cross_entropy_forward(self, inputs: Tensor, support_flash_attn=True, support_vllm=True, hf_model_id='01-ai/Yi-9B-200K') -@register_model( - ModelType.yi_9b_200k, - '01ai/Yi-9B-200K', - LoRATM.llama2, - TemplateType.default_generation, - support_flash_attn=True, - support_vllm=True) @register_model( ModelType.yi_6b, '01ai/Yi-6B', @@ -1624,15 +1617,6 @@ def get_model_tokenizer_aqlm(model_dir: str, function_kwargs={'is_awq': True}, requires=['transformers>=4.37', 'autoawq'], hf_model_id='Qwen/Qwen1.5-32B-Chat-AWQ') -@register_model( - ModelType.qwen1half_32b_chat_awq, - 'qwen/Qwen1.5-32B-Chat-AWQ', - LoRATM.qwen1half, - TemplateType.qwen, - support_flash_attn=True, - support_vllm=True, - function_kwargs={'is_awq': True}, - requires=['transformers>=4.37', 'autoawq']) @register_model( ModelType.qwen1half_72b_chat_awq, 'qwen/Qwen1.5-72B-Chat-AWQ', From 36adefdc2a29501cb08cdd28725b53a10c7ff72a Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 16 Apr 2024 21:32:36 +0800 Subject: [PATCH 10/26] update revision --- swift/llm/dpo.py | 2 ++ swift/llm/export.py | 1 + swift/llm/infer.py | 4 +++- swift/llm/sft.py | 1 + swift/llm/utils/argument.py | 10 ++++++---- swift/llm/utils/model.py | 21 ++++++++++++++++----- 
swift/llm/utils/utils.py | 31 +++++++++++++++++-------------- swift/llm/utils/vllm_utils.py | 4 +++- 8 files changed, 49 insertions(+), 25 deletions(-) diff --git a/swift/llm/dpo.py b/swift/llm/dpo.py index 58e6de18cc..56718f18b3 100644 --- a/swift/llm/dpo.py +++ b/swift/llm/dpo.py @@ -55,6 +55,7 @@ def llm_dpo(args: DPOArguments) -> str: args.torch_dtype, model_kwargs, model_id_or_path=args.model_id_or_path, + revision=args.model_revision, **kwargs) if args.ref_model_type is not None: ref_model, _ = get_model_tokenizer( @@ -62,6 +63,7 @@ def llm_dpo(args: DPOArguments) -> str: args.torch_dtype, model_kwargs, model_id_or_path=args.ref_model_id_or_path, + revision=args.model_revision, **kwargs) else: ref_model = None diff --git a/swift/llm/export.py b/swift/llm/export.py index 0fe892b8e7..3ecc282f08 100644 --- a/swift/llm/export.py +++ b/swift/llm/export.py @@ -38,6 +38,7 @@ def prepare_awq_model_template( args.torch_dtype, model_kwargs, model_id_or_path=model_id_or_path, + revision=args.model_revision, automodel_class=AutoAWQForCausalLM) logger.info(f'model_config: {model.config}') generation_config = GenerationConfig( diff --git a/swift/llm/infer.py b/swift/llm/infer.py index b6c86e91f7..e34177a2ad 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -125,7 +125,8 @@ def merge_lora(args: InferArguments, args.model_type, args.torch_dtype, model_kwargs, - model_id_or_path=model_id_or_path) + model_id_or_path=model_id_or_path, + revision=args.model_revision) logger.info(f'model_config: {model.config}') # Preparing LoRA @@ -189,6 +190,7 @@ def prepare_model_template( args.torch_dtype, model_kwargs, model_id_or_path=model_id_or_path, + revision=args.model_revision, **kwargs) logger.info(f'model_config: {model.config}') if model.max_model_len is None: diff --git a/swift/llm/sft.py b/swift/llm/sft.py index e69bd7cfd8..67d8ecfaac 100644 --- a/swift/llm/sft.py +++ b/swift/llm/sft.py @@ -78,6 +78,7 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]: 
args.torch_dtype, model_kwargs, model_id_or_path=args.model_id_or_path, + revision=args.model_revision, is_training=True, **kwargs) logger.info(f'model_config: {model.config}') diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 69dbd3d327..3447abce36 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -14,7 +14,7 @@ from torch import dtype as Dtype from transformers.utils import (is_torch_bf16_gpu_available, is_torch_cuda_available, - is_torch_npu_available) + is_torch_npu_available, strtobool) from transformers.utils.versions import require_version from swift.hub import HubApi, ModelScopeConfig @@ -1044,11 +1044,13 @@ def set_model_type(args: Union[SftArguments, InferArguments]) -> None: raise ValueError(f"model_type: '{args.model_type}' is not registered. " + error_msg) model_info = MODEL_MAPPING[args.model_type] - if args.model_revision is None: - args.model_revision = model_info['revision'] - else: + use_hf = strtobool(os.environ.get('USE_HF', 'False')) + if args.model_revision is not None: model_info['revision'] = args.model_revision logger.info(f"Setting model_info['revision']: {args.model_revision}") + elif use_hf: + model_info['revision'] = 'main' + args.model_revision = model_info['revision'] if args.model_id_or_path is None: args.model_id_or_path = model_info['model_id_or_path'] requires = model_info['requires'] diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 6d4c86842b..07e56f3437 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -359,7 +359,7 @@ def register_model( requires: Optional[List[str]] = None, torch_dtype: Optional[Dtype] = None, hf_model_id: Optional[str] = None, - revision: Optional[str] = None, + revision: Optional[str] = None, # only modelscope ignore_file_pattern: Optional[List[str]] = None, function_kwargs: Optional[Dict[str, Any]] = None, exists_ok: bool = False, @@ -3352,26 +3352,34 @@ def fix_gradient_checkpointing_warning() -> None: def 
safe_snapshot_download(model_type: str, model_id_or_path: Optional[str] = None, + revision: Optional[str] = None, **kwargs) -> str: # Perform snapshot_download (ms or hf) based on model_type and model_id_or_path. model_info = MODEL_MAPPING[model_type] + use_hf = strtobool(os.environ.get('USE_HF', 'False')) if model_id_or_path is None: model_dir = kwargs.pop('model_dir', None) # compat with swift<1.7 if model_dir is not None: model_id_or_path = model_dir else: - model_id_or_path = model_info['model_id_or_path'] + model_id_or_path = model_info[ + 'hf_model_id' if use_hf else 'model_id_or_path'] if is_dist() and not is_local_master(): dist.barrier() if model_id_or_path is not None and not os.path.exists(model_id_or_path): - revision = model_info['revision'] - use_hf = strtobool(os.environ.get('USE_HF', 'False')) ignore_file_pattern = model_info['ignore_file_pattern'] if use_hf: + if revision is None: + revision = 'main' logger.info( f'Downloading the model from HuggingFace Hub, model_id: {model_id_or_path}' ) + use_hf_transfer = strtobool( + os.environ.get('USE_HF_TRANSFER', 'False')) + if use_hf_transfer: + import huggingface_hub._snapshot_download as hf_s + hf_s.HF_HUB_ENABLE_HF_TRANSFER = True from huggingface_hub import snapshot_download as hf_snapshot_download model_dir = hf_snapshot_download( model_id_or_path, @@ -3379,6 +3387,8 @@ def safe_snapshot_download(model_type: str, revision=revision, ignore_patterns=ignore_file_pattern) else: + if revision is None: + revision = model_info['revision'] logger.info( f'Downloading the model from ModelScope Hub, model_id: {model_id_or_path}' ) @@ -3414,6 +3424,7 @@ def get_model_tokenizer( load_model: bool = True, *, model_id_or_path: Optional[str] = None, + revision: Optional[str] = None, **kwargs) -> Tuple[Optional[PreTrainedModel], PreTrainedTokenizerBase]: """ torch_dtype: If you use None, it will retrieve the torch_dtype from the config.json file. 
@@ -3421,7 +3432,7 @@ def get_model_tokenizer( """ model_dir = kwargs.pop('model_dir', None) # compat with swift<1.7 model_dir = safe_snapshot_download( - model_type, model_id_or_path, model_dir=model_dir) + model_type, model_id_or_path, revision=revision, model_dir=model_dir) model_info = MODEL_MAPPING[model_type] requires = model_info['requires'] diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index b10eeebacf..f036b1760d 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -25,7 +25,6 @@ from accelerate.utils.modeling import (get_balanced_memory, infer_auto_device_map) from datasets import Dataset as HfDataset -from modelscope import MsDataset from modelscope.utils.config_ds import MS_CACHE_HOME from modelscope.utils.logger import get_logger as get_ms_logger from torch import device as Device @@ -37,6 +36,7 @@ PreTrainedTokenizerBase, StoppingCriteriaList, TextStreamer, trainer) from transformers.generation.streamers import BaseStreamer +from transformers.utils import strtobool from swift.hub import ModelScopeConfig from swift.tuners.module_mapping import MODEL_KEYS_MAPPING @@ -92,20 +92,25 @@ def download_dataset(model_id: str, return local_dir -_old_msdataset_load = MsDataset.load +use_hf = strtobool(os.environ.get('USE_HF', 'False')) +if not use_hf: + from modelscope import MsDataset + _old_msdataset_load = MsDataset.load + @wraps(_old_msdataset_load) + def _msdataset_ddp_load(*args, **kwargs): + if is_dist() and not is_local_master(): + dist.barrier() + dataset = _old_msdataset_load(*args, **kwargs) + if is_dist() and is_local_master(): + dist.barrier() -@wraps(_old_msdataset_load) -def _msdataset_ddp_load(*args, **kwargs): - if is_dist() and not is_local_master(): - dist.barrier() - dataset = _old_msdataset_load(*args, **kwargs) - if is_dist() and is_local_master(): - dist.barrier() + if is_dist(): + dist.barrier() + return dataset - if is_dist(): - dist.barrier() - return dataset + # monkey patching + MsDataset.load = 
_msdataset_ddp_load def _get_max_memory(device_ids: List[int]) -> Dict[Union[int, str], int]: @@ -899,8 +904,6 @@ def get_max_model_len(config: PretrainedConfig) -> Optional[int]: return max_model_len -# monkey patching -MsDataset.load = _msdataset_ddp_load if is_ddp_plus_mp(): _old_ddp_init = DDP.__init__ accelerate.accelerator.torch.nn.parallel.DistributedDataParallel.__init__ = ( diff --git a/swift/llm/utils/vllm_utils.py b/swift/llm/utils/vllm_utils.py index 16aed7d3df..4e8b90e7b1 100644 --- a/swift/llm/utils/vllm_utils.py +++ b/swift/llm/utils/vllm_utils.py @@ -33,6 +33,7 @@ def get_vllm_engine(model_type: str, torch_dtype: Optional[Dtype] = None, *, model_id_or_path: Optional[str] = None, + revision: Optional[str] = None, gpu_memory_utilization: float = 0.9, tensor_parallel_size: int = 1, max_model_len: Optional[int] = None, @@ -47,7 +48,8 @@ def get_vllm_engine(model_type: str, model_type, load_model=False, model_id_or_path=model_id_or_path, - model_dir=model_dir)[1] + model_dir=model_dir, + revision=revision)[1] model_dir = tokenizer.model_dir if engine_kwargs is None: From 2c812268749676afc67f44501ae83b960596ffa1 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Wed, 17 Apr 2024 16:22:09 +0800 Subject: [PATCH 11/26] update dataset --- ...56\350\260\203\346\226\207\346\241\243.md" | 4 + swift/llm/export.py | 2 +- swift/llm/utils/dataset.py | 130 +++++++++++++----- 3 files changed, 99 insertions(+), 37 deletions(-) diff --git "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" index 8185ed0de4..2d38e22f60 100644 --- "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" +++ "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" @@ -7,6 +7,7 @@ - [量化](#量化) - [推理](#推理) - [Web-UI](#web-ui) +- [推送模型](#推送模型) ## 环境准备 GPU设备: A10, 3090, V100, A100均可. 
@@ -287,3 +288,6 @@ CUDA_VISIBLE_DEVICES=0 swift export \ CUDA_VISIBLE_DEVICES=0 swift app-ui --ckpt_dir 'xxx/vx-xxx/checkpoint-xxx-merged' ``` + +## 推送模型 +如果你想推送模型到ModelScope,可以参考[模型推送文档](LLM量化文档.md#推送模型) diff --git a/swift/llm/export.py b/swift/llm/export.py index 3ecc282f08..f4fb58e8e0 100644 --- a/swift/llm/export.py +++ b/swift/llm/export.py @@ -181,7 +181,7 @@ def llm_export(args: ExportArguments) -> None: ckpt_dir, f'{ckpt_name}-{args.quant_method}-int{args.quant_bits}') logger.info(f'Setting quant_path: {quant_path}') - assert not os.path.exists(quant_path) + assert not os.path.exists(quant_path), f'quant_path: {quant_path}' if args.quant_method == 'awq': awq_model, template = prepare_awq_model_template(args) awq_model_quantize(awq_model, template.tokenizer) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 7ac363aaa0..fcfde43c68 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -9,11 +9,11 @@ import numpy as np import pandas as pd from datasets import Dataset as HfDataset -from datasets import concatenate_datasets -from modelscope import MsDataset +from datasets import concatenate_datasets, load_dataset from numpy.random import RandomState from pandas import DataFrame from tqdm.auto import tqdm +from transformers.utils import strtobool from swift.utils import (get_logger, get_seed, read_from_jsonl, transform_jsonl_to_df) @@ -179,6 +179,7 @@ def register_dataset( preprocess_func: Optional[PreprocessFunc] = None, get_function: Optional[GetDatasetFunction] = None, *, + hf_dataset_id: Optional[str] = None, function_kwargs: Optional[Dict[str, Any]] = None, exists_ok: bool = False, **kwargs @@ -201,6 +202,7 @@ def register_dataset( 'train_subset_split_list': train_subset_split_list, 'val_subset_split_list': val_subset_split_list, 'preprocess_func': preprocess_func, + 'hf_dataset_id': hf_dataset_id, **kwargs } if get_function is not None: @@ -225,6 +227,7 @@ def _register_dataset( def load_ms_dataset( dataset_id: 
str, subset_split_list: Optional[List[SubsetSplit]]) -> Optional[HfDataset]: + from modelscope import MsDataset if subset_split_list is None or len(subset_split_list) == 0: return None dataset_list = [] @@ -241,14 +244,34 @@ def load_ms_dataset( return concatenate_datasets(dataset_list) +def load_hf_dataset( + dataset_id: str, + subset_split_list: Optional[List[SubsetSplit]]) -> Optional[HfDataset]: + if subset_split_list is None or len(subset_split_list) == 0: + return None + dataset_list = [] + for subset_split in subset_split_list: + if isinstance(subset_split, str): + subset_split = (None, subset_split) + assert len(subset_split) == 2 + subset_name, split = subset_split + dataset = load_dataset( + dataset_id, name=subset_name, split=split) + dataset_list.append(dataset) + return concatenate_datasets(dataset_list) + + @register_dataset( DatasetName.text2sql_en, 'AI-ModelScope/texttosqlv2_25000_v2', ['train'], - tags=['chat', 'sql']) + tags=['chat', 'sql'], + hf_dataset_id='Clinton/texttosqlv2_25000_v2') @register_dataset( DatasetName.school_math_zh, - 'AI-ModelScope/school_math_0.25M', ['train'], - tags=['chat', 'math']) + 'AI-ModelScope/school_math_0.25M', + ['train'], + tags=['chat', 'math'], + hf_dataset_id='BelleGroup/school_math_0.25M') @register_dataset( DatasetName.gpt4all_en, 'wyj123456/GPT4all', ['train'], @@ -268,15 +291,18 @@ def load_ms_dataset( @register_dataset( DatasetName.code_alpaca_en, 'wyj123456/code_alpaca_en', ['train'], - tags=['chat', 'coding']) + tags=['chat', 'coding'], + hf_dataset_id='sahil2801/CodeAlpaca-20k') @register_dataset( DatasetName.finance_en, 'wyj123456/finance_en', ['train'], - tags=['chat', 'financial']) + tags=['chat', 'financial'], + hf_dataset_id='ssbuild/alpaca_finance_en') @register_dataset( DatasetName.alpaca_en, 'AI-ModelScope/alpaca-gpt4-data-en', ['train'], - tags=['chat', 'general', '🔥']) + tags=['chat', 'general', '🔥'], + hf_dataset_id='vicgalle/alpaca-gpt4') @register_dataset( 
DatasetName.coig_cqia_chinese_traditional, 'AI-ModelScope/COIG-CQIA', [('chinese_traditional', 'train')], @@ -344,12 +370,16 @@ def get_dataset_from_repo( preprocess_func: PreprocessFunc, remove_useless_columns: bool = True, train_dataset_sample: int = -1, - val_dataset_sample: int = -1) -> Tuple[HfDataset, Optional[HfDataset]]: + val_dataset_sample: int = -1, + use_hf: bool = False) -> Tuple[HfDataset, Optional[HfDataset]]: dataset_list = [] _iter = zip([train_subset_split_list, val_subset_split_list], [train_dataset_sample, val_dataset_sample]) for subset_split_list, dataset_sample in _iter: - dataset = load_ms_dataset(dataset_id, subset_split_list) + if use_hf: + dataset = load_hf_dataset(dataset_id, subset_split_list) + else: + dataset = load_ms_dataset(dataset_id, subset_split_list) if dataset is not None: if dataset_sample > 0 and len(dataset) > dataset_sample: random_state = np.random.RandomState(42) @@ -402,7 +432,8 @@ def _concat_inst_inp_alpaca_zh(inst: str, inp: str) -> str: None, AlpacaPreprocessor(concat_inst_inp=_concat_inst_inp_alpaca_zh), get_dataset_from_repo, - tags=['chat', 'general', '🔥']) + tags=['chat', 'general', '🔥'], + hf_dataset_id='c-s-ale/alpaca-gpt4-data-zh') def _preprocess_vision_dataset(dataset: HfDataset) -> HfDataset: @@ -559,7 +590,8 @@ def map_row(row): 'AI-ModelScope/LongAlpaca-12k', ['train'], [], long_alpaca_preprocessor, get_dataset_from_repo, - tags=['longlora', 'QA']) + tags=['longlora', 'QA'], + hf_dataset_id='Yukang/LongAlpaca-12k') def _preprocess_ruozhiba(dataset: HfDataset): @@ -647,7 +679,8 @@ def map_row(row): 'lvjianjin/AdvertiseGen', ['train'], ['validation'], TextGenerationPreprocessor(advertise_gen_prompt, 'content', 'summary'), get_dataset_from_repo, - tags=['text-generation', '🔥']) + tags=['text-generation', '🔥'], + hf_dataset_id='shibing624/AdvertiseGen') _firefly_kind_list = [ 'ProseGeneration', 'MRC', 'JinYongGeneration', 'TextCorrection', @@ -718,7 +751,8 @@ def get_firefly_zh_dataset(dataset_id: str, 
preprocess_func, ClsPreprocessor(['neutral', 'entailment', 'contradiction'], 'Natural Language Inference', True), get_dataset_from_repo, - tags=['text-generation', 'classification']) + tags=['text-generation', 'classification'], + hf_dataset_id='clue') register_dataset( DatasetName.cmnli_mini_zh, @@ -730,7 +764,8 @@ def get_firefly_zh_dataset(dataset_id: str, preprocess_func, 'train_dataset_sample': 20000, 'val_dataset_sample': 200 }, - tags=['text-generation', 'classification', '🔥']) + tags=['text-generation', 'classification', '🔥'], + hf_dataset_id='clue') register_dataset( DatasetName.jd_sentiment_zh, @@ -1135,7 +1170,8 @@ def _preprocess_blossom_math(dataset: HfDataset) -> HfDataset: None, _preprocess_blossom_math, get_dataset_from_repo, - tags=['chat', 'math', '🔥']) + tags=['chat', 'math', '🔥'], + hf_dataset_id='Azure99/blossom-math-v2') register_dataset( DatasetName.sql_create_context_en, @@ -1150,7 +1186,8 @@ def _preprocess_blossom_math(dataset: HfDataset) -> HfDataset: AlpacaPreprocessor(), ]), get_dataset_from_repo, - tags=['chat', 'sql', '🔥']) + tags=['chat', 'sql', '🔥'], + hf_dataset_id='b-mc2/sql-create-context') register_dataset( DatasetName.lawyer_llama_zh, @@ -1162,7 +1199,8 @@ def _preprocess_blossom_math(dataset: HfDataset) -> HfDataset: 'history': '_' }), get_dataset_from_repo, - tags=['chat', 'law']) + tags=['chat', 'law'], + hf_dataset_id='Skepsun/lawyer_llama_data') def _preprocess_tigerbot_law(dataset: HfDataset) -> HfDataset: @@ -1189,7 +1227,8 @@ def _preprocess_tigerbot_law(dataset: HfDataset) -> HfDataset: None, _preprocess_tigerbot_law, get_dataset_from_repo, - tags=['text-generation', 'law', 'pretrained']) + tags=['text-generation', 'law', 'pretrained'], + hf_dataset_id='TigerResearch/tigerbot-law-plugin') def _preprocess_leetcode_python(dataset: HfDataset) -> HfDataset: @@ -1282,7 +1321,8 @@ def _preprocess_hc3(dataset: HfDataset) -> HfDataset: [[subset, 'train'] for subset in hc3_chinese_subset], [], _preprocess_hc3, 
get_dataset_from_repo, - tags=['text-generation', 'classification', '🔥']) + tags=['text-generation', 'classification', '🔥'], + hf_dataset_id='Hello-SimpleAI/HC3-Chinese') register_dataset( DatasetName.hc3_en, @@ -1290,40 +1330,46 @@ def _preprocess_hc3(dataset: HfDataset) -> HfDataset: [], _preprocess_hc3, get_dataset_from_repo, - tags=['text-generation', 'classification', '🔥']) + tags=['text-generation', 'classification', '🔥'], + hf_dataset_id='Hello-SimpleAI/HC3') register_dataset( DatasetName.tulu_v2_sft_mixture, 'AI-ModelScope/tulu-v2-sft-mixture', ['train'], [], None, get_dataset_from_repo, - tags=['chat', 'multilingual', 'general', 'multi-round']) + tags=['chat', 'multilingual', 'general', 'multi-round'], + hf_dataset_id='allenai/tulu-v2-sft-mixture') register_dataset( DatasetName.webnovel_zh, 'AI-ModelScope/webnovel_cn', ['train'], [], None, get_dataset_from_repo, - tags=['chat', 'novel']) + tags=['chat', 'novel'], + hf_dataset_id='zxbsmk/webnovel_cn') register_dataset( DatasetName.generated_chat_zh, 'AI-ModelScope/generated_chat_0.4M', ['train'], [], None, get_dataset_from_repo, - tags=['chat', 'character-dialogue']) + tags=['chat', 'character-dialogue'], + hf_dataset_id='BelleGroup/generated_chat_0.4M') register_dataset( DatasetName.wikipedia_zh, 'AI-ModelScope/wikipedia-cn-20230720-filtered', ['train'], None, RenameColumnsPreprocessor({'completion': 'response'}), get_dataset_from_repo, - tags=['text-generation', 'general', 'pretrained']) + tags=['text-generation', 'general', 'pretrained'], + hf_dataset_id='pleisto/wikipedia-cn-20230720-filtered') register_dataset( DatasetName.open_platypus_en, 'AI-ModelScope/Open-Platypus', ['train'], None, None, get_dataset_from_repo, - tags=['chat', 'math']) + tags=['chat', 'math'], + hf_dataset_id='garage-bAInd/Open-Platypus') register_dataset( DatasetName.open_orca_gpt4, 'AI-ModelScope/OpenOrca', ['train'], @@ -1366,8 +1412,10 @@ def _preprocess_hc3(dataset: HfDataset) -> HfDataset: value_key='content', 
error_strategy='delete'), get_dataset_from_repo, - tags=['chat', 'medical', '🔥']) + tags=['chat', 'medical', '🔥'], + hf_dataset_id='Flmc/DISC-Med-SFT') +# hf_dataset_id='ShengbinYue/DISC-Law-SFT' register_dataset( DatasetName.disc_law_sft_zh, 'AI-ModelScope/DISC-Law-SFT', ['train'], @@ -1381,13 +1429,14 @@ def _preprocess_hc3(dataset: HfDataset) -> HfDataset: register_dataset( DatasetName.pileval, - 'huangjintao/pile-val-backup', ['train'], + 'huangjintao/pile-val-backup', ['validation'], None, RenameColumnsPreprocessor({ 'text': 'response', }), get_dataset_from_repo, - tags=['text-generation', 'awq']) + tags=['text-generation', 'awq'], + hf_dataset_id='mit-han-lab/pile-val-backup') def add_self_cognition_dataset( @@ -1494,7 +1543,7 @@ def get_dataset( dataset_test_ratio: float = 0., dataset_seed: Union[RandomState, int] = 42, check_dataset_strategy: Literal['none', 'discard', 'error', - 'warning'] = 'none' + 'warning'] = 'none', ) -> Tuple[HfDataset, Optional[HfDataset]]: """Returns train_dataset and val_dataset""" if isinstance(dataset_name_list, str): @@ -1506,12 +1555,21 @@ def get_dataset( random_state = RandomState(dataset_seed) for dataset_name in dataset_name_list: dataset_info = DATASET_MAPPING[dataset_name] + use_hf = strtobool(os.environ.get('USE_HF', 'False')) + if use_hf: + dataset_id_or_path = dataset_info['hf_dataset_id'] + else: + dataset_id_or_path = dataset_info['dataset_id_or_path'] + assert dataset_id_or_path is not None, ( + f'dataset_name: {dataset_name}, use_hf: {use_hf}, ' + f'dataset_id_or_path: {dataset_id_or_path}.') get_function: GetDatasetFunction = dataset_info['get_function'] dataset = get_function( - dataset_info['dataset_id_or_path'], + dataset_id_or_path, train_subset_split_list=dataset_info['train_subset_split_list'], val_subset_split_list=dataset_info['val_subset_split_list'], - preprocess_func=dataset_info['preprocess_func']) + preprocess_func=dataset_info['preprocess_func'], + use_hf=use_hf) train_d: HfDataset if 
isinstance(dataset, (list, tuple)): train_d, val_d = dataset @@ -1572,12 +1630,12 @@ def load_dataset_from_local( return concatenate_datasets(dataset_list) -def get_custom_dataset(_: str, train_subset_split_list: Union[str, List[str]], - val_subset_split_list: Optional[Union[str, List[str]]], +def get_custom_dataset(_: str, train_dataset_path_list: Union[str, List[str]], + val_dataset_path_list: Optional[Union[str, List[str]]], preprocess_func: PreprocessFunc, **kwargs) -> Tuple[HfDataset, Optional[HfDataset]]: - train_dataset = load_dataset_from_local(train_subset_split_list, + train_dataset = load_dataset_from_local(train_dataset_path_list, preprocess_func) - val_dataset = load_dataset_from_local(val_subset_split_list, + val_dataset = load_dataset_from_local(val_dataset_path_list, preprocess_func) return train_dataset, val_dataset From 45020e62d80cbac2e2bb73b96a2f00a4f0928c6a Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Wed, 17 Apr 2024 17:10:07 +0800 Subject: [PATCH 12/26] fix lint --- README_CN.md | 2 +- swift/llm/utils/dataset.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/README_CN.md b/README_CN.md index c9fd83b98b..a6ea303910 100644 --- a/README_CN.md +++ b/README_CN.md @@ -53,7 +53,7 @@ SWIFT支持近**200种LLM和MLLM**(多模态大模型)的训练、推理、 - 🔥2024.04.02: 支持Mengzi3-13B-Base模型的推理与微调, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/mengzi3_13b_base/lora_ddp_ds/sft.sh)来开始训练! - 🔥2024.04.01: 支持**dbrx**系列, dbrx-base和dbrx-instruct, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/dbrx-instruct/lora_mp/sft.sh)来开始训练!. - 🔥2024.03.29: 支持**Qwen1.5-MoE**系列: Qwen1.5-MoE-A2.7B, Qwen1.5-MoE-A2.7B-Chat, Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4. -- 🔥2024.03.29: 支持**Grok-1**300B MoE模型的推理与微调, 最佳实践可以查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/LLM/Grok训练和推理.md). 
+- 🔥2024.03.29: 支持**Grok-1** 300B MoE模型的推理与微调, 最佳实践可以查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/LLM/Grok训练和推理.md). - 🔥2024.03.25: 支持TeleChat-7b和TeleChat-12b模型的训练和推理, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/telechat_12b/lora/sft.sh)来开始训练!. - 🔥2024.03.20: 支持**llava**系列的推理与微调, 最佳实践可以查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/Multi-Modal/llava最佳实践.md). - 🔥2024.03.12: 支持**deepseek-vl**系列推理和微调, 最佳实践可以查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/Multi-Modal/deepseek-vl最佳实践.md). diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index fcfde43c68..9860aa7e27 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -255,8 +255,7 @@ def load_hf_dataset( subset_split = (None, subset_split) assert len(subset_split) == 2 subset_name, split = subset_split - dataset = load_dataset( - dataset_id, name=subset_name, split=split) + dataset = load_dataset(dataset_id, name=subset_name, split=split) dataset_list.append(dataset) return concatenate_datasets(dataset_list) @@ -268,8 +267,7 @@ def load_hf_dataset( hf_dataset_id='Clinton/texttosqlv2_25000_v2') @register_dataset( DatasetName.school_math_zh, - 'AI-ModelScope/school_math_0.25M', - ['train'], + 'AI-ModelScope/school_math_0.25M', ['train'], tags=['chat', 'math'], hf_dataset_id='BelleGroup/school_math_0.25M') @register_dataset( From a34384e14ef130edbed4eb57b2bef6743f93b115 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Wed, 17 Apr 2024 17:31:17 +0800 Subject: [PATCH 13/26] update arguments --- swift/llm/utils/argument.py | 71 +++++-------------------------------- 1 file changed, 8 insertions(+), 63 deletions(-) diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 416b71398c..bdf5498f56 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -222,61 +222,6 @@ def handle_compatibility( if self.deepspeed_config_path is not None: 
self.deepspeed = self.deepspeed_config_path - def set_model_type(self: Union[SftArguments, InferArguments]) -> None: - # compat with swift<1.7 - if args.model_cache_dir is not None and args.model_id_or_path is None: - args.model_id_or_path = args.model_cache_dir - args.model_cache_dir = None - - if args.model_id_or_path is not None: - model_mapping_reversed = { - v['model_id_or_path'].lower(): k - for k, v in MODEL_MAPPING.items() - } - model_id_or_path = args.model_id_or_path - model_id_or_path_lower = model_id_or_path.lower() - if model_id_or_path_lower not in model_mapping_reversed: - if (isinstance(args, InferArguments) - and 'checkpoint' in model_id_or_path - and 'merged' not in model_id_or_path - and args.ckpt_dir is None): - raise ValueError( - 'Please use `--ckpt_dir vx-xxx/checkpoint-xxx` to use the checkpoint.' - ) - if args.model_type is None: - raise ValueError( - f"model_id_or_path: '{model_id_or_path}' is not registered. " - 'Please set `--model_type ` additionally.') - assert args.model_cache_dir is None - else: - model_type = model_mapping_reversed[model_id_or_path_lower] - assert args.model_type is None or args.model_type == model_type - args.model_type = model_type - logger.info(f'Setting args.model_type: {model_type}') - if args.model_cache_dir is not None: - args.model_id_or_path = args.model_cache_dir - - error_msg = f'The model_type you can choose: {list(MODEL_MAPPING.keys())}' - if args.model_type is None: - raise ValueError('please setting `--model_type `. ' - + error_msg) - elif args.model_type not in MODEL_MAPPING: - raise ValueError(f"model_type: '{args.model_type}' is not registered. 
" - + error_msg) - model_info = MODEL_MAPPING[args.model_type] - use_hf = strtobool(os.environ.get('USE_HF', 'False')) - if args.model_revision is not None: - model_info['revision'] = args.model_revision - logger.info(f"Setting model_info['revision']: {args.model_revision}") - elif use_hf: - model_info['revision'] = 'main' - args.model_revision = model_info['revision'] - if args.model_id_or_path is None: - args.model_id_or_path = model_info['model_id_or_path'] - requires = model_info['requires'] - for require in requires: - require_version(require) - def set_model_type(self: Union['SftArguments', 'InferArguments']) -> None: # compat with swift<1.7 if self.model_cache_dir is not None and self.model_id_or_path is None: @@ -316,16 +261,16 @@ def set_model_type(self: Union['SftArguments', 'InferArguments']) -> None: raise ValueError('please setting `--model_type `. ' + error_msg) elif self.model_type not in MODEL_MAPPING: - raise ValueError( - f"model_type: '{self.model_type}' is not registered. " - + error_msg) + raise ValueError(f"model_type: '{self.model_type}' is not registered. 
" + + error_msg) model_info = MODEL_MAPPING[self.model_type] - if self.model_revision is None: - self.model_revision = model_info['revision'] - else: + use_hf = strtobool(os.environ.get('USE_HF', 'False')) + if self.model_revision is not None: model_info['revision'] = self.model_revision - logger.info( - f"Setting model_info['revision']: {self.model_revision}") + logger.info(f"Setting model_info['revision']: {self.model_revision}") + elif use_hf: + model_info['revision'] = 'main' + self.model_revision = model_info['revision'] if self.model_id_or_path is None: self.model_id_or_path = model_info['model_id_or_path'] requires = model_info['requires'] From b617284707832e4cb96e44e574c675e5fa929e71 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Wed, 17 Apr 2024 17:32:08 +0800 Subject: [PATCH 14/26] fix lint --- swift/llm/utils/argument.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index bdf5498f56..9fab7662bd 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -261,13 +261,15 @@ def set_model_type(self: Union['SftArguments', 'InferArguments']) -> None: raise ValueError('please setting `--model_type `. ' + error_msg) elif self.model_type not in MODEL_MAPPING: - raise ValueError(f"model_type: '{self.model_type}' is not registered. " - + error_msg) + raise ValueError( + f"model_type: '{self.model_type}' is not registered. 
" + + error_msg) model_info = MODEL_MAPPING[self.model_type] use_hf = strtobool(os.environ.get('USE_HF', 'False')) if self.model_revision is not None: model_info['revision'] = self.model_revision - logger.info(f"Setting model_info['revision']: {self.model_revision}") + logger.info( + f"Setting model_info['revision']: {self.model_revision}") elif use_hf: model_info['revision'] = 'main' self.model_revision = model_info['revision'] From 241d30b72781eaa6bc8747e733a4e72babe68578 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Wed, 17 Apr 2024 19:55:21 +0800 Subject: [PATCH 15/26] update scripts --- ...14\346\225\260\346\215\256\351\233\206.md" | 610 +++++++++--------- .../LLM/Supported-models-datasets.md | 610 +++++++++--------- scripts/utils/run_dataset_info.py | 18 +- scripts/utils/run_model_info.py | 19 +- swift/llm/utils/argument.py | 3 +- swift/llm/utils/dataset.py | 6 + swift/llm/utils/model.py | 8 + 7 files changed, 655 insertions(+), 619 deletions(-) diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index c695115a76..a013d6ffeb 100644 --- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -12,218 +12,220 @@ - Support VLLM: 模型是否支持[vllm](https://github.com/vllm-project/vllm)加速推理和部署. - Requires: 对应模型所需的额外依赖要求. 
-| Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | -| --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | -|qwen-1_8b|[qwen/Qwen-1_8B](https://modelscope.cn/models/qwen/Qwen-1_8B/summary)|c_attn|default-generation|✔|✔||-| -|qwen-1_8b-chat|[qwen/Qwen-1_8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)|c_attn|qwen|✔|✔||-| -|qwen-1_8b-chat-int4|[qwen/Qwen-1_8B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-| -|qwen-1_8b-chat-int8|[qwen/Qwen-1_8B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-| -|qwen-7b|[qwen/Qwen-7B](https://modelscope.cn/models/qwen/Qwen-7B/summary)|c_attn|default-generation|✔|✔||-| -|qwen-7b-chat|[qwen/Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)|c_attn|qwen|✔|✔||-| -|qwen-7b-chat-int4|[qwen/Qwen-7B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-| -|qwen-7b-chat-int8|[qwen/Qwen-7B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-| -|qwen-14b|[qwen/Qwen-14B](https://modelscope.cn/models/qwen/Qwen-14B/summary)|c_attn|default-generation|✔|✔||-| -|qwen-14b-chat|[qwen/Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)|c_attn|qwen|✔|✔||-| -|qwen-14b-chat-int4|[qwen/Qwen-14B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-| -|qwen-14b-chat-int8|[qwen/Qwen-14B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-| -|qwen-72b|[qwen/Qwen-72B](https://modelscope.cn/models/qwen/Qwen-72B/summary)|c_attn|default-generation|✔|✔||-| 
-|qwen-72b-chat|[qwen/Qwen-72B-Chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary)|c_attn|qwen|✔|✔||-| -|qwen-72b-chat-int4|[qwen/Qwen-72B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-| -|qwen-72b-chat-int8|[qwen/Qwen-72B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-| -|qwen1half-0_5b|[qwen/Qwen1.5-0.5B](https://modelscope.cn/models/qwen/Qwen1.5-0.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-1_8b|[qwen/Qwen1.5-1.8B](https://modelscope.cn/models/qwen/Qwen1.5-1.8B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-4b|[qwen/Qwen1.5-4B](https://modelscope.cn/models/qwen/Qwen1.5-4B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-7b|[qwen/Qwen1.5-7B](https://modelscope.cn/models/qwen/Qwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-14b|[qwen/Qwen1.5-14B](https://modelscope.cn/models/qwen/Qwen1.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-32b|[qwen/Qwen1.5-32B](https://modelscope.cn/models/qwen/Qwen1.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-72b|[qwen/Qwen1.5-72B](https://modelscope.cn/models/qwen/Qwen1.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|codeqwen1half-7b|[qwen/CodeQwen1.5-7B](https://modelscope.cn/models/qwen/CodeQwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-moe-a2_7b|[qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-0_5b-chat|[qwen/Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary)|q_proj, k_proj, 
v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-1_8b-chat|[qwen/Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-4b-chat|[qwen/Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-7b-chat|[qwen/Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-14b-chat|[qwen/Qwen1.5-14B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-32b-chat|[qwen/Qwen1.5-32B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-72b-chat|[qwen/Qwen1.5-72B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-moe-a2_7b-chat|[qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|codeqwen1half-7b-chat|[qwen/CodeQwen1.5-7B-Chat](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-0_5b-chat-int4|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-1_8b-chat-int4|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-4b-chat-int4|[qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| 
-|qwen1half-7b-chat-int4|[qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-14b-chat-int4|[qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-32b-chat-int4|[qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-72b-chat-int4|[qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-0_5b-chat-int8|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-1_8b-chat-int8|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-4b-chat-int8|[qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-7b-chat-int8|[qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-14b-chat-int8|[qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-72b-chat-int8|[qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| 
-|qwen1half-moe-a2_7b-chat-int4|[qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-0_5b-chat-awq|[qwen/Qwen1.5-0.5B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|qwen1half-1_8b-chat-awq|[qwen/Qwen1.5-1.8B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|qwen1half-4b-chat-awq|[qwen/Qwen1.5-4B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|qwen1half-7b-chat-awq|[qwen/Qwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|qwen1half-14b-chat-awq|[qwen/Qwen1.5-14B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|qwen1half-72b-chat-awq|[qwen/Qwen1.5-72B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|codeqwen1half-7b-chat-awq|[qwen/CodeQwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||multi-modal, vision| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|✔|✘||multi-modal, vision| -|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|multi-modal, vision| 
-|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||multi-modal, audio| -|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||multi-modal, audio| -|chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|✘|✔||-| -|chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|✘|✔||-| -|chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|✘|✔||-| -|chatglm3-6b|[ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)|query_key_value|chatglm3|✘|✔||-| -|chatglm3-6b-32k|[ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)|query_key_value|chatglm3|✘|✔||-| -|codegeex2-6b|[ZhipuAI/codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary)|query_key_value|chatglm-generation|✘|✔|transformers<4.34|coding| -|llama2-7b|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| -|llama2-7b-chat|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔||-| -|llama2-13b|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| -|llama2-13b-chat|[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔||-| -|llama2-70b|[modelscope/Llama-2-70b-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| 
-|llama2-70b-chat|[modelscope/Llama-2-70b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔||-| -|llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|transformers>=4.38, aqlm, torch>=2.2.0|-| -|llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|✔|✘|transformers>=4.34|multi-modal, vision| -|llava1d6-yi-34b-instruct|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b/summary)|q_proj, k_proj, v_proj|llava-yi-instruct|✔|✘||multi-modal, vision| -|yi-6b|[01ai/Yi-6B](https://modelscope.cn/models/01ai/Yi-6B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-| -|yi-6b-200k|[01ai/Yi-6B-200K](https://modelscope.cn/models/01ai/Yi-6B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-| -|yi-6b-chat|[01ai/Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|✔||-| -|yi-9b|[01ai/Yi-9B](https://modelscope.cn/models/01ai/Yi-9B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-| -|yi-34b|[01ai/Yi-34B](https://modelscope.cn/models/01ai/Yi-34B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-| -|yi-34b-200k|[01ai/Yi-34B-200K](https://modelscope.cn/models/01ai/Yi-34B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-| -|yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|✔||-| -|yi-vl-6b-chat|[01ai/Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary)|q_proj, k_proj, v_proj|yi-vl|✔|✘|transformers>=4.34|multi-modal, vision| -|yi-vl-34b-chat|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B/summary)|q_proj, k_proj, 
v_proj|yi-vl|✔|✘|transformers>=4.34|multi-modal, vision| -|internlm-7b|[Shanghai_AI_Laboratory/internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✔||-| -|internlm-7b-chat|[Shanghai_AI_Laboratory/internlm-chat-7b-v1_1](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary)|q_proj, k_proj, v_proj|internlm|✘|✔||-| -|internlm-7b-chat-8k|[Shanghai_AI_Laboratory/internlm-chat-7b-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary)|q_proj, k_proj, v_proj|internlm|✘|✔||-| -|internlm-20b|[Shanghai_AI_Laboratory/internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✔||-| -|internlm-20b-chat|[Shanghai_AI_Laboratory/internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)|q_proj, k_proj, v_proj|internlm|✘|✔||-| -|internlm2-1_8b|[Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b/summary)|wqkv|default-generation-bos|✔|✔||-| -|internlm2-1_8b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft/summary)|wqkv|internlm2|✔|✔||-| -|internlm2-1_8b-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b/summary)|wqkv|internlm2|✔|✔||-| -|internlm2-7b-base|[Shanghai_AI_Laboratory/internlm2-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-7b/summary)|wqkv|default-generation-bos|✔|✔||-| -|internlm2-7b|[Shanghai_AI_Laboratory/internlm2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-7b/summary)|wqkv|default-generation-bos|✔|✔||-| 
-|internlm2-7b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b-sft/summary)|wqkv|internlm2|✔|✔||-| -|internlm2-7b-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b/summary)|wqkv|internlm2|✔|✔||-| -|internlm2-20b-base|[Shanghai_AI_Laboratory/internlm2-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-20b/summary)|wqkv|default-generation-bos|✔|✔||-| -|internlm2-20b|[Shanghai_AI_Laboratory/internlm2-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-20b/summary)|wqkv|default-generation-bos|✔|✔||-| -|internlm2-20b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b-sft/summary)|wqkv|internlm2|✔|✔||-| -|internlm2-20b-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b/summary)|wqkv|internlm2|✔|✔||-| -|internlm2-math-7b|[Shanghai_AI_Laboratory/internlm2-math-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-7b/summary)|wqkv|default-generation-bos|✔|✔||math| -|internlm2-math-7b-chat|[Shanghai_AI_Laboratory/internlm2-math-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b/summary)|wqkv|internlm2|✔|✔||math| -|internlm2-math-20b|[Shanghai_AI_Laboratory/internlm2-math-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-20b/summary)|wqkv|default-generation-bos|✔|✔||math| -|internlm2-math-20b-chat|[Shanghai_AI_Laboratory/internlm2-math-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-20b/summary)|wqkv|internlm2|✔|✔||math| -|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|wqkv|internlm-xcomposer2|✔|✘||multi-modal, vision| 
-|deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| -|deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-| -|deepseek-moe-16b|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| -|deepseek-moe-16b-chat|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-| -|deepseek-67b|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| -|deepseek-67b-chat|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-| -|deepseek-coder-1_3b|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||coding| -|deepseek-coder-1_3b-instruct|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding| -|deepseek-coder-6_7b|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||coding| -|deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding| -|deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, 
v_proj|default-generation-bos|✔|✔||coding| -|deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding| -|deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||math| -|deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math| -|deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math| -|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||multi-modal, vision| -|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||multi-modal, vision| -|gemma-2b|[AI-ModelScope/gemma-2b](https://modelscope.cn/models/AI-ModelScope/gemma-2b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.38|-| -|gemma-7b|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.38|-| -|gemma-2b-instruct|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.38|-| -|gemma-7b-instruct|[AI-ModelScope/gemma-7b-it](https://modelscope.cn/models/AI-ModelScope/gemma-7b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.38|-| -|minicpm-1b-sft-chat|[OpenBMB/MiniCPM-1B-sft-bf16](https://modelscope.cn/models/OpenBMB/MiniCPM-1B-sft-bf16/summary)|q_proj, k_proj, 
v_proj|minicpm|✔|✔|transformers>=4.36.0|-| -|minicpm-2b-sft-chat|[OpenBMB/MiniCPM-2B-sft-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-sft-fp32/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔||-| -|minicpm-2b-chat|[OpenBMB/MiniCPM-2B-dpo-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp32/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔||-| -|minicpm-2b-128k|[OpenBMB/MiniCPM-2B-128k](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-128k/summary)|q_proj, k_proj, v_proj|chatml|✔|✔|transformers>=4.36.0|-| -|minicpm-moe-8x2b|[OpenBMB/MiniCPM-MoE-8x2B](https://modelscope.cn/models/OpenBMB/MiniCPM-MoE-8x2B/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔|transformers>=4.36.0|-| -|minicpm-v-3b-chat|[OpenBMB/MiniCPM-V](https://modelscope.cn/models/OpenBMB/MiniCPM-V/summary)|q_proj, k_proj, v_proj|minicpm-v|✔|✘||-| -|minicpm-v-v2|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2/summary)|q_proj, k_proj, v_proj|minicpm-v|✔|✘||-| -|openbuddy-llama2-13b-chat|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-| -|openbuddy-llama-65b-chat|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-| -|openbuddy-llama2-70b-chat|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-| -|openbuddy-mistral-7b-chat|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v17.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.34|-| -|openbuddy-zephyr-7b-chat|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.34|-| 
-|openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-| -|openbuddy-mixtral-moe-7b-chat|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.36|-| -|mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.34|-| -|mistral-7b-v2|[AI-ModelScope/Mistral-7B-v0.2-hf](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.2-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.34|-| -|mistral-7b-instruct|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34|-| -|mistral-7b-instruct-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34|-| -|mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.36|-| -|mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.36|-| -|mixtral-moe-7b-aqlm-2bit-1x16|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|transformers>=4.38, aqlm, torch>=2.2.0|-| -|mixtral-moe-8x22b-v1|[AI-ModelScope/Mixtral-8x22B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x22B-v0.1/summary)|q_proj, k_proj, 
v_proj|default-generation-bos|✔|✔|transformers>=4.36|-| -|baichuan-7b|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)|W_pack|default-generation|✘|✔|transformers<4.34|-| -|baichuan-13b|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary)|W_pack|default-generation|✘|✔|transformers<4.34|-| -|baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|✘|✔|transformers<4.34|-| -|baichuan2-7b|[baichuan-inc/Baichuan2-7B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary)|W_pack|default-generation|✘|✔||-| -|baichuan2-7b-chat|[baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary)|W_pack|baichuan|✘|✔||-| -|baichuan2-7b-chat-int4|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary)|W_pack|baichuan|✘|✘|bitsandbytes<0.41.2, accelerate<0.26|-| -|baichuan2-13b|[baichuan-inc/Baichuan2-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary)|W_pack|default-generation|✘|✔||-| -|baichuan2-13b-chat|[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)|W_pack|baichuan|✘|✔||-| -|baichuan2-13b-chat-int4|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)|W_pack|baichuan|✘|✘|bitsandbytes<0.41.2, accelerate<0.26|-| -|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|transformers<4.35, icecream|-| -|mplug-owl2d1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|transformers<4.35, icecream|-| 
-|yuan2-2b-instruct|[YuanLLM/Yuan2.0-2B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-2B-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-| -|yuan2-2b-janus-instruct|[YuanLLM/Yuan2-2B-Janus-hf](https://modelscope.cn/models/YuanLLM/Yuan2-2B-Janus-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-| -|yuan2-51b-instruct|[YuanLLM/Yuan2.0-51B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-51B-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-| -|yuan2-102b-instruct|[YuanLLM/Yuan2.0-102B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-102B-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-| -|xverse-7b|[xverse/XVERSE-7B](https://modelscope.cn/models/xverse/XVERSE-7B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|xverse-7b-chat|[xverse/XVERSE-7B-Chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘||-| -|xverse-13b|[xverse/XVERSE-13B](https://modelscope.cn/models/xverse/XVERSE-13B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|xverse-13b-chat|[xverse/XVERSE-13B-Chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘||-| -|xverse-65b|[xverse/XVERSE-65B](https://modelscope.cn/models/xverse/XVERSE-65B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|xverse-65b-v2|[xverse/XVERSE-65B-2](https://modelscope.cn/models/xverse/XVERSE-65B-2/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|xverse-65b-chat|[xverse/XVERSE-65B-Chat](https://modelscope.cn/models/xverse/XVERSE-65B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘||-| -|xverse-13b-256k|[xverse/XVERSE-13B-256K](https://modelscope.cn/models/xverse/XVERSE-13B-256K/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|xverse-moe-a4_2b|[xverse/XVERSE-MoE-A4.2B](https://modelscope.cn/models/xverse/XVERSE-MoE-A4.2B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|orion-14b|[OrionStarAI/Orion-14B-Base](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base/summary)|q_proj, k_proj, 
v_proj|default-generation|✔|✘||-| -|orion-14b-chat|[OrionStarAI/Orion-14B-Chat](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat/summary)|q_proj, k_proj, v_proj|orion|✔|✘||-| -|bluelm-7b|[vivo-ai/BlueLM-7B-Base](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘||-| -|bluelm-7b-32k|[vivo-ai/BlueLM-7B-Base-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘||-| -|bluelm-7b-chat|[vivo-ai/BlueLM-7B-Chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary)|q_proj, k_proj, v_proj|bluelm|✘|✘||-| -|bluelm-7b-chat-32k|[vivo-ai/BlueLM-7B-Chat-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)|q_proj, k_proj, v_proj|bluelm|✘|✘||-| -|ziya2-13b|[Fengshenbang/Ziya2-13B-Base](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| -|ziya2-13b-chat|[Fengshenbang/Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)|q_proj, k_proj, v_proj|ziya|✔|✔||-| -|skywork-13b|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘||-| -|skywork-13b-chat|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)|q_proj, k_proj, v_proj|skywork|✘|✘||-| -|zephyr-7b-beta-chat|[modelscope/zephyr-7b-beta](https://modelscope.cn/models/modelscope/zephyr-7b-beta/summary)|q_proj, k_proj, v_proj|zephyr|✔|✔|transformers>=4.34|-| -|polylm-13b|[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary)|c_attn|default-generation|✘|✘||-| -|seqgpt-560m|[damo/nlp_seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)|query_key_value|default-generation|✘|✔||-| -|sus-34b-chat|[SUSTC/SUS-Chat-34B](https://modelscope.cn/models/SUSTC/SUS-Chat-34B/summary)|q_proj, k_proj, 
v_proj|sus|✔|✔||-| -|tongyi-finance-14b|[TongyiFinance/Tongyi-Finance-14B](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary)|c_attn|default-generation|✔|✔||financial| -|tongyi-finance-14b-chat|[TongyiFinance/Tongyi-Finance-14B-Chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary)|c_attn|qwen|✔|✔||financial| -|tongyi-finance-14b-chat-int4|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|financial| -|codefuse-codellama-34b-chat|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)|q_proj, k_proj, v_proj|codefuse-codellama|✔|✔||coding| -|codefuse-codegeex2-6b-chat|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B/summary)|query_key_value|codefuse|✘|✔|transformers<4.34|coding| -|codefuse-qwen-14b-chat|[codefuse-ai/CodeFuse-QWen-14B](https://modelscope.cn/models/codefuse-ai/CodeFuse-QWen-14B/summary)|c_attn|codefuse|✔|✔||coding| -|phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|✔|✔||coding| -|cogvlm-17b-instruct|[ZhipuAI/cogvlm-chat](https://modelscope.cn/models/ZhipuAI/cogvlm-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense|cogvlm-instruct|✘|✘||multi-modal, vision| -|cogagent-18b-chat|[ZhipuAI/cogagent-chat](https://modelscope.cn/models/ZhipuAI/cogagent-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-chat|✘|✘||multi-modal, vision| -|cogagent-18b-instruct|[ZhipuAI/cogagent-vqa](https://modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, 
dense|cogagent-instruct|✘|✘||multi-modal, vision| -|mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-| -|mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-| -|mamba-390m|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-| -|mamba-790m|[AI-ModelScope/mamba-790m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-790m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-| -|mamba-1.4b|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-| -|mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-| -|telechat-7b|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B/summary)|key_value, query|telechat|✔|✘||-| -|telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|key_value, query|telechat|✔|✘||-| -|grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|✔|✔|transformers>=4.36|-| -|dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|✔|✔|transformers>=4.36|-| 
-|mengzi3-13b-base|[langboat/Mengzi3-13B-Base](https://modelscope.cn/models/langboat/Mengzi3-13B-Base/summary)|q_proj, k_proj, v_proj|mengzi|✔|✔||-| -|c4ai-command-r-v01|[AI-ModelScope/c4ai-command-r-v01](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-v01/summary)|q_proj, k_proj, v_proj|c4ai|✔|✘|transformers>=4.39.1|-| -|c4ai-command-r-plus|[AI-ModelScope/c4ai-command-r-plus](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-plus/summary)|q_proj, k_proj, v_proj|c4ai|✔|✘|transformers>4.39|-| +| Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID | +| --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | ----------- | +|qwen-1_8b|[qwen/Qwen-1_8B](https://modelscope.cn/models/qwen/Qwen-1_8B/summary)|c_attn|default-generation|✔|✔||-|[Qwen/Qwen-1_8B](https://huggingface.co/Qwen/Qwen-1_8B)| +|qwen-1_8b-chat|[qwen/Qwen-1_8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)|c_attn|qwen|✔|✔||-|[Qwen/Qwen-1_8B-Chat](https://huggingface.co/Qwen/Qwen-1_8B-Chat)| +|qwen-1_8b-chat-int4|[qwen/Qwen-1_8B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-|[Qwen/Qwen-1_8B-Chat-Int4](https://huggingface.co/Qwen/Qwen-1_8B-Chat-Int4)| +|qwen-1_8b-chat-int8|[qwen/Qwen-1_8B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-|[Qwen/Qwen-1_8B-Chat-Int8](https://huggingface.co/Qwen/Qwen-1_8B-Chat-Int8)| +|qwen-7b|[qwen/Qwen-7B](https://modelscope.cn/models/qwen/Qwen-7B/summary)|c_attn|default-generation|✔|✔||-|[Qwen/Qwen-7B](https://huggingface.co/Qwen/Qwen-7B)| +|qwen-7b-chat|[qwen/Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)|c_attn|qwen|✔|✔||-|[Qwen/Qwen-7B-Chat](https://huggingface.co/Qwen/Qwen-7B-Chat)| 
+|qwen-7b-chat-int4|[qwen/Qwen-7B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-|[Qwen/Qwen-7B-Chat-Int4](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4)| +|qwen-7b-chat-int8|[qwen/Qwen-7B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-|[Qwen/Qwen-7B-Chat-Int8](https://huggingface.co/Qwen/Qwen-7B-Chat-Int8)| +|qwen-14b|[qwen/Qwen-14B](https://modelscope.cn/models/qwen/Qwen-14B/summary)|c_attn|default-generation|✔|✔||-|[Qwen/Qwen-14B](https://huggingface.co/Qwen/Qwen-14B)| +|qwen-14b-chat|[qwen/Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)|c_attn|qwen|✔|✔||-|[Qwen/Qwen-14B-Chat](https://huggingface.co/Qwen/Qwen-14B-Chat)| +|qwen-14b-chat-int4|[qwen/Qwen-14B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-|[Qwen/Qwen-14B-Chat-Int4](https://huggingface.co/Qwen/Qwen-14B-Chat-Int4)| +|qwen-14b-chat-int8|[qwen/Qwen-14B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-|[Qwen/Qwen-14B-Chat-Int8](https://huggingface.co/Qwen/Qwen-14B-Chat-Int8)| +|qwen-72b|[qwen/Qwen-72B](https://modelscope.cn/models/qwen/Qwen-72B/summary)|c_attn|default-generation|✔|✔||-|[Qwen/Qwen-72B](https://huggingface.co/Qwen/Qwen-72B)| +|qwen-72b-chat|[qwen/Qwen-72B-Chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary)|c_attn|qwen|✔|✔||-|[Qwen/Qwen-72B-Chat](https://huggingface.co/Qwen/Qwen-72B-Chat)| +|qwen-72b-chat-int4|[qwen/Qwen-72B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-|[Qwen/Qwen-72B-Chat-Int4](https://huggingface.co/Qwen/Qwen-72B-Chat-Int4)| +|qwen-72b-chat-int8|[qwen/Qwen-72B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-|[Qwen/Qwen-72B-Chat-Int8](https://huggingface.co/Qwen/Qwen-72B-Chat-Int8)| 
+|qwen1half-0_5b|[qwen/Qwen1.5-0.5B](https://modelscope.cn/models/qwen/Qwen1.5-0.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-0.5B](https://huggingface.co/Qwen/Qwen1.5-0.5B)| +|qwen1half-1_8b|[qwen/Qwen1.5-1.8B](https://modelscope.cn/models/qwen/Qwen1.5-1.8B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-1.8B](https://huggingface.co/Qwen/Qwen1.5-1.8B)| +|qwen1half-4b|[qwen/Qwen1.5-4B](https://modelscope.cn/models/qwen/Qwen1.5-4B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B)| +|qwen1half-7b|[qwen/Qwen1.5-7B](https://modelscope.cn/models/qwen/Qwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-7B](https://huggingface.co/Qwen/Qwen1.5-7B)| +|qwen1half-14b|[qwen/Qwen1.5-14B](https://modelscope.cn/models/qwen/Qwen1.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-14B](https://huggingface.co/Qwen/Qwen1.5-14B)| +|qwen1half-32b|[qwen/Qwen1.5-32B](https://modelscope.cn/models/qwen/Qwen1.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-32B](https://huggingface.co/Qwen/Qwen1.5-32B)| +|qwen1half-72b|[qwen/Qwen1.5-72B](https://modelscope.cn/models/qwen/Qwen1.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-72B](https://huggingface.co/Qwen/Qwen1.5-72B)| +|codeqwen1half-7b|[qwen/CodeQwen1.5-7B](https://modelscope.cn/models/qwen/CodeQwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|-| +|qwen1half-moe-a2_7b|[qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B)| 
+|qwen1half-0_5b-chat|[qwen/Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-0.5B-Chat](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat)| +|qwen1half-1_8b-chat|[qwen/Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-1.8B-Chat](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat)| +|qwen1half-4b-chat|[qwen/Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-4B-Chat](https://huggingface.co/Qwen/Qwen1.5-4B-Chat)| +|qwen1half-7b-chat|[qwen/Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-7B-Chat](https://huggingface.co/Qwen/Qwen1.5-7B-Chat)| +|qwen1half-14b-chat|[qwen/Qwen1.5-14B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-14B-Chat](https://huggingface.co/Qwen/Qwen1.5-14B-Chat)| +|qwen1half-32b-chat|[qwen/Qwen1.5-32B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-32B-Chat](https://huggingface.co/Qwen/Qwen1.5-32B-Chat)| +|qwen1half-72b-chat|[qwen/Qwen1.5-72B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-72B-Chat](https://huggingface.co/Qwen/Qwen1.5-72B-Chat)| +|qwen1half-moe-a2_7b-chat|[qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat)| +|codeqwen1half-7b-chat|[qwen/CodeQwen1.5-7B-Chat](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat/summary)|q_proj, k_proj, 
v_proj|qwen|✔|✔|transformers>=4.37|-|-| +|qwen1half-0_5b-chat-int4|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4)| +|qwen1half-1_8b-chat-int4|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4)| +|qwen1half-4b-chat-int4|[qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GPTQ-Int4)| +|qwen1half-7b-chat-int4|[qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4)| +|qwen1half-14b-chat-int4|[qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-GPTQ-Int4)| +|qwen1half-32b-chat-int4|[qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-32B-Chat-GPTQ-Int4)| +|qwen1half-72b-chat-int4|[qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, 
transformers>=4.37|-|[Qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-GPTQ-Int4)| +|qwen1half-0_5b-chat-int8|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8)| +|qwen1half-1_8b-chat-int8|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8)| +|qwen1half-4b-chat-int8|[qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GPTQ-Int8)| +|qwen1half-7b-chat-int8|[qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int8)| +|qwen1half-14b-chat-int8|[qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-GPTQ-Int8)| +|qwen1half-72b-chat-int8|[qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-GPTQ-Int8)| 
+|qwen1half-moe-a2_7b-chat-int4|[qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4)| +|qwen1half-0_5b-chat-awq|[qwen/Qwen1.5-0.5B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-0.5B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-AWQ)| +|qwen1half-1_8b-chat-awq|[qwen/Qwen1.5-1.8B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-1.8B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-AWQ)| +|qwen1half-4b-chat-awq|[qwen/Qwen1.5-4B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-4B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-AWQ)| +|qwen1half-7b-chat-awq|[qwen/Qwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-7B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-AWQ)| +|qwen1half-14b-chat-awq|[qwen/Qwen1.5-14B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-14B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-AWQ)| +|qwen1half-32b-chat-awq|[qwen/Qwen1.5-32B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-32B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-32B-Chat-AWQ)| +|qwen1half-72b-chat-awq|[qwen/Qwen1.5-72B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-AWQ/summary)|q_proj, 
k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-72B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-AWQ)| +|codeqwen1half-7b-chat-awq|[qwen/CodeQwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|-| +|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||multi-modal, vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|✔|✘||multi-modal, vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| +|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|multi-modal, vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| +|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||multi-modal, audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| +|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||multi-modal, audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| +|chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|✘|✔||-|[THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)| +|chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|✘|✔||-|[THUDM/chatglm2-6b-32k](https://huggingface.co/THUDM/chatglm2-6b-32k)| +|chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|✘|✔||-|[THUDM/chatglm3-6b-base](https://huggingface.co/THUDM/chatglm3-6b-base)| 
+|chatglm3-6b|[ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)|query_key_value|chatglm3|✘|✔||-|[THUDM/chatglm3-6b](https://huggingface.co/THUDM/chatglm3-6b)| +|chatglm3-6b-32k|[ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)|query_key_value|chatglm3|✘|✔||-|[THUDM/chatglm3-6b-32k](https://huggingface.co/THUDM/chatglm3-6b-32k)| +|codegeex2-6b|[ZhipuAI/codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary)|query_key_value|chatglm-generation|✘|✔|transformers<4.34|coding|[THUDM/codegeex2-6b](https://huggingface.co/THUDM/codegeex2-6b)| +|llama2-7b|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)| +|llama2-7b-chat|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔||-|[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)| +|llama2-13b|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf)| +|llama2-13b-chat|[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔||-|[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)| +|llama2-70b|[modelscope/Llama-2-70b-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf)| +|llama2-70b-chat|[modelscope/Llama-2-70b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)|q_proj, k_proj, 
v_proj|llama|✔|✔||-|[meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)| +|llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)| +|llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|✔|✘|transformers>=4.34|multi-modal, vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)| +|llava1d6-yi-34b-instruct|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b/summary)|q_proj, k_proj, v_proj|llava-yi-instruct|✔|✘||multi-modal, vision|[liuhaotian/llava-v1.6-34b](https://huggingface.co/liuhaotian/llava-v1.6-34b)| +|yi-6b|[01ai/Yi-6B](https://modelscope.cn/models/01ai/Yi-6B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B)| +|yi-6b-200k|[01ai/Yi-6B-200K](https://modelscope.cn/models/01ai/Yi-6B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-6B-200K](https://huggingface.co/01-ai/Yi-6B-200K)| +|yi-6b-chat|[01ai/Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|✔||-|[01-ai/Yi-6B-Chat](https://huggingface.co/01-ai/Yi-6B-Chat)| +|yi-9b|[01ai/Yi-9B](https://modelscope.cn/models/01ai/Yi-9B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-9B](https://huggingface.co/01-ai/Yi-9B)| +|yi-9b-200k|[01ai/Yi-9B-200K](https://modelscope.cn/models/01ai/Yi-9B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-9B-200K](https://huggingface.co/01-ai/Yi-9B-200K)| 
+|yi-34b|[01ai/Yi-34B](https://modelscope.cn/models/01ai/Yi-34B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-34B](https://huggingface.co/01-ai/Yi-34B)| +|yi-34b-200k|[01ai/Yi-34B-200K](https://modelscope.cn/models/01ai/Yi-34B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-34B-200K](https://huggingface.co/01-ai/Yi-34B-200K)| +|yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|✔||-|[01-ai/Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat)| +|yi-vl-6b-chat|[01ai/Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary)|q_proj, k_proj, v_proj|yi-vl|✔|✘|transformers>=4.34|multi-modal, vision|[01-ai/Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B)| +|yi-vl-34b-chat|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B/summary)|q_proj, k_proj, v_proj|yi-vl|✔|✘|transformers>=4.34|multi-modal, vision|[01-ai/Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)| +|internlm-7b|[Shanghai_AI_Laboratory/internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✔||-|[internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b)| +|internlm-7b-chat|[Shanghai_AI_Laboratory/internlm-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b/summary)|q_proj, k_proj, v_proj|internlm|✘|✔||-|[internlm/internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b)| +|internlm-7b-chat-8k|[Shanghai_AI_Laboratory/internlm-chat-7b-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary)|q_proj, k_proj, v_proj|internlm|✘|✔||-|-| +|internlm-20b|[Shanghai_AI_Laboratory/internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✔||-|[internlm/internlm-20b](https://huggingface.co/internlm/internlm-20b)|
+|internlm-20b-chat|[Shanghai_AI_Laboratory/internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)|q_proj, k_proj, v_proj|internlm|✘|✔||-|[internlm/internlm-chat-20b](https://huggingface.co/internlm/internlm-chat-20b)| +|internlm2-1_8b|[Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b/summary)|wqkv|default-generation-bos|✔|✔||-|[internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b)| +|internlm2-1_8b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft/summary)|wqkv|internlm2|✔|✔||-|[internlm/internlm2-chat-1_8b-sft](https://huggingface.co/internlm/internlm2-chat-1_8b-sft)| +|internlm2-1_8b-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b/summary)|wqkv|internlm2|✔|✔||-|[internlm/internlm2-chat-1_8b](https://huggingface.co/internlm/internlm2-chat-1_8b)| +|internlm2-7b-base|[Shanghai_AI_Laboratory/internlm2-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-7b/summary)|wqkv|default-generation-bos|✔|✔||-|[internlm/internlm2-base-7b](https://huggingface.co/internlm/internlm2-base-7b)| +|internlm2-7b|[Shanghai_AI_Laboratory/internlm2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-7b/summary)|wqkv|default-generation-bos|✔|✔||-|[internlm/internlm2-7b](https://huggingface.co/internlm/internlm2-7b)| +|internlm2-7b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b-sft/summary)|wqkv|internlm2|✔|✔||-|[internlm/internlm2-chat-7b-sft](https://huggingface.co/internlm/internlm2-chat-7b-sft)|
+|internlm2-7b-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b/summary)|wqkv|internlm2|✔|✔||-|[internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b)| +|internlm2-20b-base|[Shanghai_AI_Laboratory/internlm2-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-20b/summary)|wqkv|default-generation-bos|✔|✔||-|[internlm/internlm2-base-20b](https://huggingface.co/internlm/internlm2-base-20b)| +|internlm2-20b|[Shanghai_AI_Laboratory/internlm2-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-20b/summary)|wqkv|default-generation-bos|✔|✔||-|[internlm/internlm2-20b](https://huggingface.co/internlm/internlm2-20b)| +|internlm2-20b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b-sft/summary)|wqkv|internlm2|✔|✔||-|[internlm/internlm2-chat-20b-sft](https://huggingface.co/internlm/internlm2-chat-20b-sft)| +|internlm2-20b-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b/summary)|wqkv|internlm2|✔|✔||-|[internlm/internlm2-chat-20b](https://huggingface.co/internlm/internlm2-chat-20b)| +|internlm2-math-7b|[Shanghai_AI_Laboratory/internlm2-math-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-7b/summary)|wqkv|default-generation-bos|✔|✔||math|[internlm/internlm2-math-base-7b](https://huggingface.co/internlm/internlm2-math-base-7b)| +|internlm2-math-7b-chat|[Shanghai_AI_Laboratory/internlm2-math-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b/summary)|wqkv|internlm2|✔|✔||math|[internlm/internlm2-math-7b](https://huggingface.co/internlm/internlm2-math-7b)| 
+|internlm2-math-20b|[Shanghai_AI_Laboratory/internlm2-math-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-20b/summary)|wqkv|default-generation-bos|✔|✔||math|[internlm/internlm2-math-base-20b](https://huggingface.co/internlm/internlm2-math-base-20b)| +|internlm2-math-20b-chat|[Shanghai_AI_Laboratory/internlm2-math-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-20b/summary)|wqkv|internlm2|✔|✔||math|[internlm/internlm2-math-20b](https://huggingface.co/internlm/internlm2-math-20b)| +|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|wqkv|internlm-xcomposer2|✔|✘||multi-modal, vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)| +|deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[deepseek-ai/deepseek-llm-7b-base](https://huggingface.co/deepseek-ai/deepseek-llm-7b-base)| +|deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-|[deepseek-ai/deepseek-llm-7b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat)| +|deepseek-moe-16b|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[deepseek-ai/deepseek-moe-16b-base](https://huggingface.co/deepseek-ai/deepseek-moe-16b-base)| +|deepseek-moe-16b-chat|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-|[deepseek-ai/deepseek-moe-16b-chat](https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat)| 
+|deepseek-67b|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[deepseek-ai/deepseek-llm-67b-base](https://huggingface.co/deepseek-ai/deepseek-llm-67b-base)| +|deepseek-67b-chat|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-|[deepseek-ai/deepseek-llm-67b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat)| +|deepseek-coder-1_3b|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||coding|[deepseek-ai/deepseek-coder-1.3b-base](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base)| +|deepseek-coder-1_3b-instruct|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-1.3b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-instruct)| +|deepseek-coder-6_7b|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||coding|[deepseek-ai/deepseek-coder-6.7b-base](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base)| +|deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)| +|deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, 
v_proj|default-generation-bos|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)| +|deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)| +|deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)| +|deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)| +|deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)| +|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||multi-modal, vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)| +|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||multi-modal, vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)| +|gemma-2b|[AI-ModelScope/gemma-2b](https://modelscope.cn/models/AI-ModelScope/gemma-2b/summary)|q_proj, k_proj, 
v_proj|default-generation-bos|✔|✔|transformers>=4.38|-|[google/gemma-2b](https://huggingface.co/google/gemma-2b)| +|gemma-7b|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.38|-|[google/gemma-7b](https://huggingface.co/google/gemma-7b)| +|gemma-2b-instruct|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.38|-|[google/gemma-2b-it](https://huggingface.co/google/gemma-2b-it)| +|gemma-7b-instruct|[AI-ModelScope/gemma-7b-it](https://modelscope.cn/models/AI-ModelScope/gemma-7b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.38|-|[google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it)| +|minicpm-1b-sft-chat|[OpenBMB/MiniCPM-1B-sft-bf16](https://modelscope.cn/models/OpenBMB/MiniCPM-1B-sft-bf16/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔|transformers>=4.36.0|-|[openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16)| +|minicpm-2b-sft-chat|[OpenBMB/MiniCPM-2B-sft-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-sft-fp32/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔||-|[openbmb/MiniCPM-2B-sft-fp32](https://huggingface.co/openbmb/MiniCPM-2B-sft-fp32)| +|minicpm-2b-chat|[OpenBMB/MiniCPM-2B-dpo-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp32/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔||-|[openbmb/MiniCPM-2B-dpo-fp32](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32)| +|minicpm-2b-128k|[OpenBMB/MiniCPM-2B-128k](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-128k/summary)|q_proj, k_proj, v_proj|chatml|✔|✔|transformers>=4.36.0|-|[openbmb/MiniCPM-2B-128k](https://huggingface.co/openbmb/MiniCPM-2B-128k)| +|minicpm-moe-8x2b|[OpenBMB/MiniCPM-MoE-8x2B](https://modelscope.cn/models/OpenBMB/MiniCPM-MoE-8x2B/summary)|q_proj, k_proj, 
v_proj|minicpm|✔|✔|transformers>=4.36.0|-|[openbmb/MiniCPM-MoE-8x2B](https://huggingface.co/openbmb/MiniCPM-MoE-8x2B)| +|minicpm-v-3b-chat|[OpenBMB/MiniCPM-V](https://modelscope.cn/models/OpenBMB/MiniCPM-V/summary)|q_proj, k_proj, v_proj|minicpm-v|✔|✘||-|[openbmb/MiniCPM-V](https://huggingface.co/openbmb/MiniCPM-V)| +|minicpm-v-v2|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2/summary)|q_proj, k_proj, v_proj|minicpm-v|✔|✘||-|[openbmb/MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2)| +|openbuddy-llama2-13b-chat|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://huggingface.co/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16)| +|openbuddy-llama-65b-chat|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama-65b-v8-bf16)| +|openbuddy-llama2-70b-chat|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16)| +|openbuddy-mistral-7b-chat|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v17.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.34|-|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://huggingface.co/OpenBuddy/openbuddy-mistral-7b-v17.1-32k)| +|openbuddy-zephyr-7b-chat|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)|q_proj, k_proj, 
v_proj|openbuddy|✔|✔|transformers>=4.34|-|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://huggingface.co/OpenBuddy/openbuddy-zephyr-7b-v14.1)| +|openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://huggingface.co/OpenBuddy/openbuddy-deepseek-67b-v15.2)| +|openbuddy-mixtral-moe-7b-chat|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.36|-|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://huggingface.co/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k)| +|mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.34|-|[mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)| +|mistral-7b-v2|[AI-ModelScope/Mistral-7B-v0.2-hf](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.2-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.34|-|[alpindale/Mistral-7B-v0.2-hf](https://huggingface.co/alpindale/Mistral-7B-v0.2-hf)| +|mistral-7b-instruct|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34|-|[mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)| +|mistral-7b-instruct-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34|-|[mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)| 
+|mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.36|-|[mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)| +|mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.36|-|[mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| +|mixtral-moe-7b-aqlm-2bit-1x16|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf)| +|mixtral-moe-8x22b-v1|[AI-ModelScope/Mixtral-8x22B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x22B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.36|-|[mistral-community/Mixtral-8x22B-v0.1](https://huggingface.co/mistral-community/Mixtral-8x22B-v0.1)| +|baichuan-7b|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)|W_pack|default-generation|✘|✔|transformers<4.34|-|[baichuan-inc/Baichuan-7B](https://huggingface.co/baichuan-inc/Baichuan-7B)| +|baichuan-13b|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary)|W_pack|default-generation|✘|✔|transformers<4.34|-|[baichuan-inc/Baichuan-13B-Base](https://huggingface.co/baichuan-inc/Baichuan-13B-Base)| +|baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|✘|✔|transformers<4.34|-|[baichuan-inc/Baichuan-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan-13B-Chat)| 
+|baichuan2-7b|[baichuan-inc/Baichuan2-7B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary)|W_pack|default-generation|✘|✔||-|[baichuan-inc/Baichuan2-7B-Base](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base)| +|baichuan2-7b-chat|[baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary)|W_pack|baichuan|✘|✔||-|[baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)| +|baichuan2-7b-chat-int4|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary)|W_pack|baichuan|✘|✘|bitsandbytes<0.41.2, accelerate<0.26|-|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat-4bits)| +|baichuan2-13b|[baichuan-inc/Baichuan2-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary)|W_pack|default-generation|✘|✔||-|[baichuan-inc/Baichuan2-13B-Base](https://huggingface.co/baichuan-inc/Baichuan2-13B-Base)| +|baichuan2-13b-chat|[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)|W_pack|baichuan|✘|✔||-|[baichuan-inc/Baichuan2-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat)| +|baichuan2-13b-chat-int4|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)|W_pack|baichuan|✘|✘|bitsandbytes<0.41.2, accelerate<0.26|-|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat-4bits)| +|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|transformers<4.35, icecream|-|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)| +|mplug-owl2d1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, 
c_attn.multiway.1|mplug-owl2|✔|✘|transformers<4.35, icecream|-|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)| +|yuan2-2b-instruct|[YuanLLM/Yuan2.0-2B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-2B-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-|[IEITYuan/Yuan2-2B-hf](https://huggingface.co/IEITYuan/Yuan2-2B-hf)| +|yuan2-2b-janus-instruct|[YuanLLM/Yuan2-2B-Janus-hf](https://modelscope.cn/models/YuanLLM/Yuan2-2B-Janus-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-|[IEITYuan/Yuan2-2B-Janus-hf](https://huggingface.co/IEITYuan/Yuan2-2B-Janus-hf)| +|yuan2-51b-instruct|[YuanLLM/Yuan2.0-51B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-51B-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-|[IEITYuan/Yuan2-51B-hf](https://huggingface.co/IEITYuan/Yuan2-51B-hf)| +|yuan2-102b-instruct|[YuanLLM/Yuan2.0-102B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-102B-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-|[IEITYuan/Yuan2-102B-hf](https://huggingface.co/IEITYuan/Yuan2-102B-hf)| +|xverse-7b|[xverse/XVERSE-7B](https://modelscope.cn/models/xverse/XVERSE-7B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|[xverse/XVERSE-7B](https://huggingface.co/xverse/XVERSE-7B)| +|xverse-7b-chat|[xverse/XVERSE-7B-Chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘||-|[xverse/XVERSE-7B-Chat](https://huggingface.co/xverse/XVERSE-7B-Chat)| +|xverse-13b|[xverse/XVERSE-13B](https://modelscope.cn/models/xverse/XVERSE-13B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|[xverse/XVERSE-13B](https://huggingface.co/xverse/XVERSE-13B)| +|xverse-13b-chat|[xverse/XVERSE-13B-Chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘||-|[xverse/XVERSE-13B-Chat](https://huggingface.co/xverse/XVERSE-13B-Chat)| +|xverse-65b|[xverse/XVERSE-65B](https://modelscope.cn/models/xverse/XVERSE-65B/summary)|q_proj, k_proj, 
v_proj|default-generation|✘|✘||-|[xverse/XVERSE-65B](https://huggingface.co/xverse/XVERSE-65B)| +|xverse-65b-v2|[xverse/XVERSE-65B-2](https://modelscope.cn/models/xverse/XVERSE-65B-2/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|[xverse/XVERSE-65B-2](https://huggingface.co/xverse/XVERSE-65B-2)| +|xverse-65b-chat|[xverse/XVERSE-65B-Chat](https://modelscope.cn/models/xverse/XVERSE-65B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘||-|[xverse/XVERSE-65B-Chat](https://huggingface.co/xverse/XVERSE-65B-Chat)| +|xverse-13b-256k|[xverse/XVERSE-13B-256K](https://modelscope.cn/models/xverse/XVERSE-13B-256K/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|[xverse/XVERSE-13B-256K](https://huggingface.co/xverse/XVERSE-13B-256K)| +|xverse-moe-a4_2b|[xverse/XVERSE-MoE-A4.2B](https://modelscope.cn/models/xverse/XVERSE-MoE-A4.2B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|[xverse/XVERSE-MoE-A4.2B](https://huggingface.co/xverse/XVERSE-MoE-A4.2B)| +|orion-14b|[OrionStarAI/Orion-14B-Base](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✘||-|[OrionStarAI/Orion-14B-Base](https://huggingface.co/OrionStarAI/Orion-14B-Base)| +|orion-14b-chat|[OrionStarAI/Orion-14B-Chat](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat/summary)|q_proj, k_proj, v_proj|orion|✔|✘||-|[OrionStarAI/Orion-14B-Chat](https://huggingface.co/OrionStarAI/Orion-14B-Chat)| +|bluelm-7b|[vivo-ai/BlueLM-7B-Base](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘||-|[vivo-ai/BlueLM-7B-Base](https://huggingface.co/vivo-ai/BlueLM-7B-Base)| +|bluelm-7b-32k|[vivo-ai/BlueLM-7B-Base-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘||-|[vivo-ai/BlueLM-7B-Base-32K](https://huggingface.co/vivo-ai/BlueLM-7B-Base-32K)| 
+|bluelm-7b-chat|[vivo-ai/BlueLM-7B-Chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary)|q_proj, k_proj, v_proj|bluelm|✘|✘||-|[vivo-ai/BlueLM-7B-Chat](https://huggingface.co/vivo-ai/BlueLM-7B-Chat)| +|bluelm-7b-chat-32k|[vivo-ai/BlueLM-7B-Chat-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)|q_proj, k_proj, v_proj|bluelm|✘|✘||-|[vivo-ai/BlueLM-7B-Chat-32K](https://huggingface.co/vivo-ai/BlueLM-7B-Chat-32K)| +|ziya2-13b|[Fengshenbang/Ziya2-13B-Base](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[IDEA-CCNL/Ziya2-13B-Base](https://huggingface.co/IDEA-CCNL/Ziya2-13B-Base)| +|ziya2-13b-chat|[Fengshenbang/Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)|q_proj, k_proj, v_proj|ziya|✔|✔||-|[IDEA-CCNL/Ziya2-13B-Chat](https://huggingface.co/IDEA-CCNL/Ziya2-13B-Chat)| +|skywork-13b|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘||-|[Skywork/Skywork-13B-base](https://huggingface.co/Skywork/Skywork-13B-base)| +|skywork-13b-chat|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)|q_proj, k_proj, v_proj|skywork|✘|✘||-|-| +|zephyr-7b-beta-chat|[modelscope/zephyr-7b-beta](https://modelscope.cn/models/modelscope/zephyr-7b-beta/summary)|q_proj, k_proj, v_proj|zephyr|✔|✔|transformers>=4.34|-|[HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)| +|polylm-13b|[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary)|c_attn|default-generation|✘|✘||-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)| +|seqgpt-560m|[damo/nlp_seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)|query_key_value|default-generation|✘|✔||-|[DAMO-NLP/SeqGPT-560M](https://huggingface.co/DAMO-NLP/SeqGPT-560M)| 
+|sus-34b-chat|[SUSTC/SUS-Chat-34B](https://modelscope.cn/models/SUSTC/SUS-Chat-34B/summary)|q_proj, k_proj, v_proj|sus|✔|✔||-|[SUSTech/SUS-Chat-34B](https://huggingface.co/SUSTech/SUS-Chat-34B)| +|tongyi-finance-14b|[TongyiFinance/Tongyi-Finance-14B](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary)|c_attn|default-generation|✔|✔||financial|-| +|tongyi-finance-14b-chat|[TongyiFinance/Tongyi-Finance-14B-Chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary)|c_attn|qwen|✔|✔||financial|[jxy/Tongyi-Finance-14B-Chat](https://huggingface.co/jxy/Tongyi-Finance-14B-Chat)| +|tongyi-finance-14b-chat-int4|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|financial|[jxy/Tongyi-Finance-14B-Chat-Int4](https://huggingface.co/jxy/Tongyi-Finance-14B-Chat-Int4)| +|codefuse-codellama-34b-chat|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)|q_proj, k_proj, v_proj|codefuse-codellama|✔|✔||coding|[codefuse-ai/CodeFuse-CodeLlama-34B](https://huggingface.co/codefuse-ai/CodeFuse-CodeLlama-34B)| +|codefuse-codegeex2-6b-chat|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B/summary)|query_key_value|codefuse|✘|✔|transformers<4.34|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)| +|codefuse-qwen-14b-chat|[codefuse-ai/CodeFuse-QWen-14B](https://modelscope.cn/models/codefuse-ai/CodeFuse-QWen-14B/summary)|c_attn|codefuse|✔|✔||coding|[codefuse-ai/CodeFuse-QWen-14B](https://huggingface.co/codefuse-ai/CodeFuse-QWen-14B)| +|phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|✔|✔||coding|[microsoft/phi-2](https://huggingface.co/microsoft/phi-2)| 
+|cogvlm-17b-instruct|[ZhipuAI/cogvlm-chat](https://modelscope.cn/models/ZhipuAI/cogvlm-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense|cogvlm-instruct|✘|✘||multi-modal, vision|[THUDM/cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf)| +|cogagent-18b-chat|[ZhipuAI/cogagent-chat](https://modelscope.cn/models/ZhipuAI/cogagent-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-chat|✘|✘||multi-modal, vision|[THUDM/cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf)| +|cogagent-18b-instruct|[ZhipuAI/cogagent-vqa](https://modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-instruct|✘|✘||multi-modal, vision|[THUDM/cogagent-vqa-hf](https://huggingface.co/THUDM/cogagent-vqa-hf)| +|mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)| +|mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)| +|mamba-390m|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-390m-hf](https://huggingface.co/state-spaces/mamba-390m-hf)| +|mamba-790m|[AI-ModelScope/mamba-790m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-790m-hf/summary)|in_proj, x_proj, embeddings, 
out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-790m-hf](https://huggingface.co/state-spaces/mamba-790m-hf)| +|mamba-1.4b|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-1.4b-hf](https://huggingface.co/state-spaces/mamba-1.4b-hf)| +|mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)| +|telechat-7b|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B/summary)|key_value, query|telechat|✔|✘||-|[Tele-AI/telechat-7B](https://huggingface.co/Tele-AI/telechat-7B)| +|telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|key_value, query|telechat|✔|✘||-|[Tele-AI/TeleChat-12B](https://huggingface.co/Tele-AI/TeleChat-12B)| +|grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|[hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1)| +|dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|✔|✔|transformers>=4.36|-|[databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct)| +|dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|✔|✔|transformers>=4.36|-|[databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)| +|mengzi3-13b-base|[langboat/Mengzi3-13B-Base](https://modelscope.cn/models/langboat/Mengzi3-13B-Base/summary)|q_proj, k_proj, v_proj|mengzi|✔|✔||-|[Langboat/Mengzi3-13B-Base](https://huggingface.co/Langboat/Mengzi3-13B-Base)| 
+|c4ai-command-r-v01|[AI-ModelScope/c4ai-command-r-v01](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-v01/summary)|q_proj, k_proj, v_proj|c4ai|✔|✘|transformers>=4.39.1|-|[CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)| +|c4ai-command-r-plus|[AI-ModelScope/c4ai-command-r-plus](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-plus/summary)|q_proj, k_proj, v_proj|c4ai|✔|✘|transformers>4.39|-|[CohereForAI/c4ai-command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| ## 数据集 @@ -233,95 +235,95 @@ - Size: 数据集中的数据样本数量. - Statistic: 数据集的统计量. 我们使用token数进行统计, 这对于调整`max_length`超参数有帮助. 我们将数据集的训练集和验证集进行拼接, 然后进行统计. 我们使用qwen的tokenizer对数据集进行分词. 不同的tokenizer的统计量不同, 如果你要获取其他的模型的tokenizer的token统计量, 可以通过[脚本](https://github.com/modelscope/swift/tree/main/scripts/utils/run_dataset_info.py)自行获取. -| Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags | -| ------------ | ---------- | ---------- | -------- | ----------------- | ---- | -|🔥ms-bench|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|316228|0|345.0±441.3, min=22, max=30960|chat, general, multi-round| -|🔥ms-bench-mini|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|19492|0|353.9±439.4, min=29, max=12078|chat, general, multi-round| -|🔥alpaca-en|[AI-ModelScope/alpaca-gpt4-data-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)|52002|0|176.2±125.8, min=26, max=740|chat, general| -|🔥alpaca-zh|[AI-ModelScope/alpaca-gpt4-data-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)|48818|0|162.1±93.9, min=26, max=856|chat, general| -|multi-alpaca-all|[damo/nlp_polylm_multialpaca_sft](https://modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary)|131867|0|112.9±50.6, min=26, max=1226|chat, general, multilingual| -|instinwild-en|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|52191|0|160.2±69.7, min=33, 
max=763|chat, general| -|instinwild-zh|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|51504|0|130.3±45.1, min=28, max=1434|chat, general| -|cot-en|[YorickHe/CoT](https://modelscope.cn/datasets/YorickHe/CoT/summary)|74771|0|122.7±64.8, min=51, max=8320|chat, general| -|cot-zh|[YorickHe/CoT_zh](https://modelscope.cn/datasets/YorickHe/CoT_zh/summary)|74771|0|117.5±70.8, min=43, max=9636|chat, general| -|firefly-all-zh|[wyj123456/firefly](https://modelscope.cn/datasets/wyj123456/firefly/summary)|1649399|0|178.1±260.4, min=26, max=12516|chat, general| -|instruct-en|[wyj123456/instruct](https://modelscope.cn/datasets/wyj123456/instruct/summary)|888970|0|268.9±331.2, min=26, max=7252|chat, general| -|gpt4all-en|[wyj123456/GPT4all](https://modelscope.cn/datasets/wyj123456/GPT4all/summary)|806199|0|302.5±384.1, min=27, max=7391|chat, general| -|sharegpt-en|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|99799|0|1045.7±431.9, min=22, max=7907|chat, general, multi-round| -|sharegpt-zh|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|135399|0|806.3±771.7, min=21, max=65318|chat, general, multi-round| -|tulu-v2-sft-mixture|[AI-ModelScope/tulu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture/summary)|326154|0|867.8±996.4, min=22, max=12111|chat, multilingual, general, multi-round| -|wikipedia-zh|[AI-ModelScope/wikipedia-cn-20230720-filtered](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered/summary)|254547|0|568.4±713.2, min=37, max=78678|text-generation, general, pretrained| -|open-orca|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|3239027|0|360.4±402.9, min=27, max=8672|chat, multilingual, general| -|open-orca-gpt4|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|994896|0|382.3±417.4, min=31, max=8740|chat, multilingual, general| 
-|sharegpt-gpt4|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|103063|0|1286.2±2089.4, min=22, max=221080|chat, multilingual, general, multi-round| -|🔥sharegpt-gpt4-mini|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|6205|0|3511.6±6068.5, min=33, max=116018|chat, multilingual, general, multi-round, gpt4| -|🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)|30000|0|647.7±217.1, min=199, max=2722|chat, agent, multi-round| -|ms-agent-for-agentfabric-default|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|30000|0|617.8±199.1, min=251, max=2657|chat, agent, multi-round| -|ms-agent-for-agentfabric-addition|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|488|0|2084.9±1514.8, min=489, max=7354|chat, agent, multi-round| -|damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|422115|161|965.7±440.9, min=321, max=31535|chat, agent, multi-round| -|damo-agent-mini-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|39964|152|1230.9±350.1, min=558, max=4982|chat, agent, multi-round| -|agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|1866|0|1144.3±635.5, min=206, max=6412|chat, agent, multi-round| -|code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)|20016|0|100.1±60.1, min=29, max=1776|chat, coding| -|🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)|2359|0|723.8±233.5, min=259, max=2117|chat, coding| 
-|🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)|27224|0|483.6±193.9, min=45, max=3082|chat, coding| -|🔥codefuse-evol-instruction-zh|[codefuse-ai/Evol-instruction-66k](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k/summary)|66862|0|439.6±206.3, min=37, max=2983|chat, coding| -|medical-en|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|117117|500|257.4±89.1, min=36, max=2564|chat, medical| -|medical-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|1950472|500|167.2±219.7, min=26, max=27351|chat, medical| -|medical-mini-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|50000|500|168.1±220.8, min=26, max=12320|chat, medical| -|🔥disc-med-sft-zh|[AI-ModelScope/DISC-Med-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Med-SFT/summary)|441767|0|354.1±193.1, min=25, max=2231|chat, medical| -|lawyer-llama-zh|[AI-ModelScope/lawyer_llama_data](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary)|21476|0|194.4±91.7, min=27, max=924|chat, law| -|tigerbot-law-zh|[AI-ModelScope/tigerbot-law-plugin](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)|55895|0|109.9±126.4, min=37, max=18878|text-generation, law, pretrained| -|🔥disc-law-sft-zh|[AI-ModelScope/DISC-Law-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Law-SFT/summary)|166758|0|533.7±495.4, min=30, max=15169|chat, law| -|🔥blossom-math-zh|[AI-ModelScope/blossom-math-v2](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2/summary)|10000|0|169.3±58.7, min=35, max=563|chat, math| -|school-math-zh|[AI-ModelScope/school_math_0.25M](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M/summary)|248480|0|157.6±72.1, min=33, max=3450|chat, math| 
-|open-platypus-en|[AI-ModelScope/Open-Platypus](https://modelscope.cn/datasets/AI-ModelScope/Open-Platypus/summary)|24926|0|367.9±254.8, min=30, max=3951|chat, math| -|text2sql-en|[AI-ModelScope/texttosqlv2_25000_v2](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2/summary)|25000|0|274.6±326.4, min=38, max=1975|chat, sql| -|🔥sql-create-context-en|[AI-ModelScope/sql-create-context](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context/summary)|78577|0|80.2±17.8, min=36, max=456|chat, sql| -|🔥advertise-gen-zh|[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)|97484|915|131.6±21.7, min=52, max=242|text-generation| -|🔥dureader-robust-zh|[modelscope/DuReader_robust-QG](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary)|15937|1962|242.1±137.4, min=61, max=1417|text-generation| -|cmnli-zh|[clue](https://modelscope.cn/datasets/clue/summary)|391783|12241|83.6±16.6, min=52, max=200|text-generation, classification| -|🔥cmnli-mini-zh|[clue](https://modelscope.cn/datasets/clue/summary)|20000|200|82.9±16.3, min=52, max=188|text-generation, classification| -|🔥jd-sentiment-zh|[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd/summary)|45012|4988|67.0±83.2, min=40, max=4040|text-generation, classification| -|🔥hc3-zh|[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese/summary)|39781|0|177.8±81.5, min=58, max=3052|text-generation, classification| -|🔥hc3-en|[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3/summary)|11021|0|299.3±138.7, min=66, max=2268|text-generation, classification| -|finance-en|[wyj123456/finance_en](https://modelscope.cn/datasets/wyj123456/finance_en/summary)|68911|0|135.6±134.3, min=26, max=3525|chat, financial| -|poetry-zh|[modelscope/chinese-poetry-collection](https://modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary)|388599|1710|55.2±9.4, min=23, max=83|text-generation, poetry| 
-|webnovel-zh|[AI-ModelScope/webnovel_cn](https://modelscope.cn/datasets/AI-ModelScope/webnovel_cn/summary)|50000|0|1478.9±11526.1, min=100, max=490484|chat, novel| -|generated-chat-zh|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)|396004|0|273.3±52.0, min=32, max=873|chat, character-dialogue| -|cls-fudan-news-zh|[damo/zh_cls_fudan-news](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/summary)|4959|0|3234.4±2547.5, min=91, max=19548|chat, classification| -|ner-jave-zh|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)|1266|0|118.3±45.5, min=44, max=223|chat, ner| -|long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)|11998|0|9619.0±8295.8, min=36, max=78925|longlora, QA| -|coco-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|414113|40504|298.8±2.8, min=294, max=351|chat, multi-modal, vision| -|🔥coco-mini-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|298.8±2.8, min=294, max=339|chat, multi-modal, vision| -|🔥coco-mini-en-2|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|36.8±2.8, min=32, max=77|chat, multi-modal, vision| -|capcha-images|[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images/summary)|6000|2000|29.0±0.0, min=29, max=29|chat, multi-modal, vision| -|aishell1-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|134424|7176|152.2±36.8, min=63, max=419|chat, multi-modal, audio| -|🔥aishell1-mini-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|14326|200|152.0±35.5, min=74, max=359|chat, multi-modal, audio| 
-|hh-rlhf-harmless-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|42462|2308|167.2±123.1, min=22, max=986|rlhf, dpo, pairwise| -|hh-rlhf-helpful-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|43777|2348|201.9±135.2, min=25, max=1070|rlhf, dpo, pairwise| -|hh-rlhf-helpful-online|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|10150|1137|401.5±278.7, min=32, max=1987|rlhf, dpo, pairwise| -|hh-rlhf-helpful-rejection-sampled|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise| -|hh-rlhf-red-team-attempts|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise| -|🔥hh-rlhf-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|172085|9292|172.8±124.0, min=22, max=1638|rlhf, dpo, pairwise| -|hh-rlhf-cn-harmless-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|143.9±109.4, min=24, max=3078|rlhf, dpo, pairwise| -|hh-rlhf-cn-helpful-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|176.8±120.0, min=26, max=1420|rlhf, dpo, pairwise| -|hh-rlhf-cn-harmless-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|167.5±123.2, min=22, max=986|rlhf, dpo, pairwise| -|hh-rlhf-cn-helpful-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|202.2±135.3, min=25, max=1070|rlhf, dpo, pairwise| -|stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)|4483004|0|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise| 
-|pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)|214670|0|1612.3±8856.2, min=11, max=1208955|text-generation, awq| -|🔥coig-cqia-chinese-traditional|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1111|0|172.6±59.9, min=55, max=856|general| -|🔥coig-cqia-coig-pc|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3000|0|353.5±859.6, min=34, max=19288|general| -|🔥coig-cqia-exam|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|4856|0|275.0±240.0, min=45, max=4932|general| -|🔥coig-cqia-finance|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|11288|0|1266.4±561.1, min=60, max=10582|general| -|🔥coig-cqia-douban|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3086|0|402.9±544.7, min=88, max=10870|general| -|🔥coig-cqia-human-value|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1007|0|151.2±77.3, min=39, max=656|general| -|🔥coig-cqia-logi-qa|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|421|0|309.8±188.8, min=43, max=1306|general| -|🔥coig-cqia-ruozhiba|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|240|0|189.8±62.2, min=33, max=505|general| -|🔥coig-cqia-segmentfault|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|458|0|449.0±495.8, min=87, max=6342|general| -|🔥coig-cqia-wiki|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|10603|0|619.2±515.8, min=73, max=10140|general| -|🔥coig-cqia-wikihow|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1485|0|1700.0±790.9, min=260, max=6371|general| 
-|🔥coig-cqia-xhs|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1508|0|438.0±179.6, min=129, max=2191|general| -|🔥coig-cqia-zhihu|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|5631|0|540.7±306.7, min=161, max=3036|general| -|🔥ruozhiba-post-annual|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|1361|0|36.6±15.3, min=24, max=559|pretrain| -|🔥ruozhiba-title-good|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|2597|0|41.9±19.3, min=22, max=246|pretrain| -|🔥ruozhiba-title-norm|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|81700|0|39.9±12.8, min=21, max=386|pretrain| +| Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags | HF Dataset ID | +| ------------ | ---------- | ---------- | -------- | ----------------- | ---- | ------------- | +|🔥ms-bench|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|316228|0|345.0±441.3, min=22, max=30960|chat, general, multi-round|-| +|🔥ms-bench-mini|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|19492|0|353.9±439.4, min=29, max=12078|chat, general, multi-round|-| +|🔥alpaca-en|[AI-ModelScope/alpaca-gpt4-data-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)|52002|0|176.2±125.8, min=26, max=740|chat, general|[vicgalle/alpaca-gpt4](https://huggingface.co/datasets/vicgalle/alpaca-gpt4)| +|🔥alpaca-zh|[AI-ModelScope/alpaca-gpt4-data-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)|48818|0|162.1±93.9, min=26, max=856|chat, general|[c-s-ale/alpaca-gpt4-data-zh](https://huggingface.co/datasets/c-s-ale/alpaca-gpt4-data-zh)| +|multi-alpaca-all|[damo/nlp_polylm_multialpaca_sft](https://modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary)|131867|0|112.9±50.6, min=26, max=1226|chat, general, multilingual|-| 
+|instinwild-en|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|52191|0|160.2±69.7, min=33, max=763|chat, general|-| +|instinwild-zh|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|51504|0|130.3±45.1, min=28, max=1434|chat, general|-| +|cot-en|[YorickHe/CoT](https://modelscope.cn/datasets/YorickHe/CoT/summary)|74771|0|122.7±64.8, min=51, max=8320|chat, general|-| +|cot-zh|[YorickHe/CoT_zh](https://modelscope.cn/datasets/YorickHe/CoT_zh/summary)|74771|0|117.5±70.8, min=43, max=9636|chat, general|-| +|firefly-all-zh|[wyj123456/firefly](https://modelscope.cn/datasets/wyj123456/firefly/summary)|1649399|0|178.1±260.4, min=26, max=12516|chat, general|-| +|instruct-en|[wyj123456/instruct](https://modelscope.cn/datasets/wyj123456/instruct/summary)|888970|0|268.9±331.2, min=26, max=7252|chat, general|-| +|gpt4all-en|[wyj123456/GPT4all](https://modelscope.cn/datasets/wyj123456/GPT4all/summary)|806199|0|302.5±384.1, min=27, max=7391|chat, general|-| +|sharegpt-en|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|99799|0|1045.7±431.9, min=22, max=7907|chat, general, multi-round|-| +|sharegpt-zh|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|135399|0|806.3±771.7, min=21, max=65318|chat, general, multi-round|-| +|tulu-v2-sft-mixture|[AI-ModelScope/tulu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture/summary)|326154|0|867.8±996.4, min=22, max=12111|chat, multilingual, general, multi-round|[allenai/tulu-v2-sft-mixture](https://huggingface.co/datasets/allenai/tulu-v2-sft-mixture)| +|wikipedia-zh|[AI-ModelScope/wikipedia-cn-20230720-filtered](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered/summary)|254547|0|568.4±713.2, min=37, max=78678|text-generation, general, 
pretrained|[pleisto/wikipedia-cn-20230720-filtered](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)| +|open-orca|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|3239027|0|360.4±402.9, min=27, max=8672|chat, multilingual, general|-| +|open-orca-gpt4|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|994896|0|382.3±417.4, min=31, max=8740|chat, multilingual, general|-| +|sharegpt-gpt4|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|103063|0|1286.2±2089.4, min=22, max=221080|chat, multilingual, general, multi-round|-| +|🔥sharegpt-gpt4-mini|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|6205|0|3511.6±6068.5, min=33, max=116018|chat, multilingual, general, multi-round, gpt4|-| +|🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)|30000|0|647.7±217.1, min=199, max=2722|chat, agent, multi-round|-| +|ms-agent-for-agentfabric-default|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|30000|0|617.8±199.1, min=251, max=2657|chat, agent, multi-round|-| +|ms-agent-for-agentfabric-addition|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|488|0|2084.9±1514.8, min=489, max=7354|chat, agent, multi-round|-| +|damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|422115|161|965.7±440.9, min=321, max=31535|chat, agent, multi-round|-| +|damo-agent-mini-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|39964|152|1230.9±350.1, min=558, max=4982|chat, agent, multi-round|-| +|agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|1866|0|1144.3±635.5, min=206, max=6412|chat, agent, 
multi-round|-| +|code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)|20016|0|100.1±60.1, min=29, max=1776|chat, coding|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)| +|🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)|2359|0|723.8±233.5, min=259, max=2117|chat, coding|-| +|🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)|27224|0|483.6±193.9, min=45, max=3082|chat, coding|-| +|🔥codefuse-evol-instruction-zh|[codefuse-ai/Evol-instruction-66k](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k/summary)|66862|0|439.6±206.3, min=37, max=2983|chat, coding|-| +|medical-en|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|117117|500|257.4±89.1, min=36, max=2564|chat, medical|-| +|medical-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|1950472|500|167.2±219.7, min=26, max=27351|chat, medical|-| +|medical-mini-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|50000|500|168.1±220.8, min=26, max=12320|chat, medical|-| +|🔥disc-med-sft-zh|[AI-ModelScope/DISC-Med-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Med-SFT/summary)|441767|0|354.1±193.1, min=25, max=2231|chat, medical|[Flmc/DISC-Med-SFT](https://huggingface.co/datasets/Flmc/DISC-Med-SFT)| +|lawyer-llama-zh|[AI-ModelScope/lawyer_llama_data](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary)|21476|0|194.4±91.7, min=27, max=924|chat, law|[Skepsun/lawyer_llama_data](https://huggingface.co/datasets/Skepsun/lawyer_llama_data)| +|tigerbot-law-zh|[AI-ModelScope/tigerbot-law-plugin](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)|55895|0|109.9±126.4, min=37, 
max=18878|text-generation, law, pretrained|[TigerResearch/tigerbot-law-plugin](https://huggingface.co/datasets/TigerResearch/tigerbot-law-plugin)| +|🔥disc-law-sft-zh|[AI-ModelScope/DISC-Law-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Law-SFT/summary)|166758|0|533.7±495.4, min=30, max=15169|chat, law|-| +|🔥blossom-math-zh|[AI-ModelScope/blossom-math-v2](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2/summary)|10000|0|169.3±58.7, min=35, max=563|chat, math|[Azure99/blossom-math-v2](https://huggingface.co/datasets/Azure99/blossom-math-v2)| +|school-math-zh|[AI-ModelScope/school_math_0.25M](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M/summary)|248480|0|157.6±72.1, min=33, max=3450|chat, math|[BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)| +|open-platypus-en|[AI-ModelScope/Open-Platypus](https://modelscope.cn/datasets/AI-ModelScope/Open-Platypus/summary)|24926|0|367.9±254.8, min=30, max=3951|chat, math|[garage-bAInd/Open-Platypus](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)| +|text2sql-en|[AI-ModelScope/texttosqlv2_25000_v2](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2/summary)|25000|0|274.6±326.4, min=38, max=1975|chat, sql|[Clinton/texttosqlv2_25000_v2](https://huggingface.co/datasets/Clinton/texttosqlv2_25000_v2)| +|🔥sql-create-context-en|[AI-ModelScope/sql-create-context](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context/summary)|78577|0|80.2±17.8, min=36, max=456|chat, sql|[b-mc2/sql-create-context](https://huggingface.co/datasets/b-mc2/sql-create-context)| +|🔥advertise-gen-zh|[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)|97484|915|131.6±21.7, min=52, max=242|text-generation|[shibing624/AdvertiseGen](https://huggingface.co/datasets/shibing624/AdvertiseGen)| 
+|🔥dureader-robust-zh|[modelscope/DuReader_robust-QG](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary)|15937|1962|242.1±137.4, min=61, max=1417|text-generation|-| +|cmnli-zh|[clue](https://modelscope.cn/datasets/clue/summary)|391783|12241|83.6±16.6, min=52, max=200|text-generation, classification|[clue](https://huggingface.co/datasets/clue)| +|🔥cmnli-mini-zh|[clue](https://modelscope.cn/datasets/clue/summary)|20000|200|82.9±16.3, min=52, max=188|text-generation, classification|[clue](https://huggingface.co/datasets/clue)| +|🔥jd-sentiment-zh|[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd/summary)|45012|4988|67.0±83.2, min=40, max=4040|text-generation, classification|-| +|🔥hc3-zh|[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese/summary)|39781|0|177.8±81.5, min=58, max=3052|text-generation, classification|[Hello-SimpleAI/HC3-Chinese](https://huggingface.co/datasets/Hello-SimpleAI/HC3-Chinese)| +|🔥hc3-en|[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3/summary)|11021|0|299.3±138.7, min=66, max=2268|text-generation, classification|[Hello-SimpleAI/HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3)| +|finance-en|[wyj123456/finance_en](https://modelscope.cn/datasets/wyj123456/finance_en/summary)|68911|0|135.6±134.3, min=26, max=3525|chat, financial|[ssbuild/alpaca_finance_en](https://huggingface.co/datasets/ssbuild/alpaca_finance_en)| +|poetry-zh|[modelscope/chinese-poetry-collection](https://modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary)|388599|1710|55.2±9.4, min=23, max=83|text-generation, poetry|-| +|webnovel-zh|[AI-ModelScope/webnovel_cn](https://modelscope.cn/datasets/AI-ModelScope/webnovel_cn/summary)|50000|0|1478.9±11526.1, min=100, max=490484|chat, novel|[zxbsmk/webnovel_cn](https://huggingface.co/datasets/zxbsmk/webnovel_cn)| 
+|generated-chat-zh|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)|396004|0|273.3±52.0, min=32, max=873|chat, character-dialogue|[BelleGroup/generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)| +|cls-fudan-news-zh|[damo/zh_cls_fudan-news](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/summary)|4959|0|3234.4±2547.5, min=91, max=19548|chat, classification|-| +|ner-jave-zh|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)|1266|0|118.3±45.5, min=44, max=223|chat, ner|-| +|long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)|11998|0|9619.0±8295.8, min=36, max=78925|longlora, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)| +|coco-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|414113|40504|298.8±2.8, min=294, max=351|chat, multi-modal, vision|-| +|🔥coco-mini-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|298.8±2.8, min=294, max=339|chat, multi-modal, vision|-| +|🔥coco-mini-en-2|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|36.8±2.8, min=32, max=77|chat, multi-modal, vision|-| +|capcha-images|[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images/summary)|6000|2000|29.0±0.0, min=29, max=29|chat, multi-modal, vision|-| +|aishell1-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|134424|7176|152.2±36.8, min=63, max=419|chat, multi-modal, audio|-| +|🔥aishell1-mini-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|14326|200|152.0±35.5, min=74, max=359|chat, multi-modal, audio|-| 
+|hh-rlhf-harmless-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|42462|2308|167.2±123.1, min=22, max=986|rlhf, dpo, pairwise|-| +|hh-rlhf-helpful-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|43777|2348|201.9±135.2, min=25, max=1070|rlhf, dpo, pairwise|-| +|hh-rlhf-helpful-online|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|10150|1137|401.5±278.7, min=32, max=1987|rlhf, dpo, pairwise|-| +|hh-rlhf-helpful-rejection-sampled|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise|-| +|hh-rlhf-red-team-attempts|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise|-| +|🔥hh-rlhf-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|172085|9292|172.8±124.0, min=22, max=1638|rlhf, dpo, pairwise|-| +|hh-rlhf-cn-harmless-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|143.9±109.4, min=24, max=3078|rlhf, dpo, pairwise|-| +|hh-rlhf-cn-helpful-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|176.8±120.0, min=26, max=1420|rlhf, dpo, pairwise|-| +|hh-rlhf-cn-harmless-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|167.5±123.2, min=22, max=986|rlhf, dpo, pairwise|-| +|hh-rlhf-cn-helpful-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|202.2±135.3, min=25, max=1070|rlhf, dpo, pairwise|-| +|stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)|4483004|0|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|-| 
+|pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)|214670|0|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)| +|🔥coig-cqia-chinese-traditional|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1111|0|172.6±59.9, min=55, max=856|general|-| +|🔥coig-cqia-coig-pc|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3000|0|353.5±859.6, min=34, max=19288|general|-| +|🔥coig-cqia-exam|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|4856|0|275.0±240.0, min=45, max=4932|general|-| +|🔥coig-cqia-finance|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|11288|0|1266.4±561.1, min=60, max=10582|general|-| +|🔥coig-cqia-douban|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3086|0|402.9±544.7, min=88, max=10870|general|-| +|🔥coig-cqia-human-value|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1007|0|151.2±77.3, min=39, max=656|general|-| +|🔥coig-cqia-logi-qa|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|421|0|309.8±188.8, min=43, max=1306|general|-| +|🔥coig-cqia-ruozhiba|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|240|0|189.8±62.2, min=33, max=505|general|-| +|🔥coig-cqia-segmentfault|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|458|0|449.0±495.8, min=87, max=6342|general|-| +|🔥coig-cqia-wiki|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|10603|0|619.2±515.8, min=73, max=10140|general|-| +|🔥coig-cqia-wikihow|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1485|0|1700.0±790.9, 
min=260, max=6371|general|-| +|🔥coig-cqia-xhs|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1508|0|438.0±179.6, min=129, max=2191|general|-| +|🔥coig-cqia-zhihu|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|5631|0|540.7±306.7, min=161, max=3036|general|-| +|🔥ruozhiba-post-annual|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|1361|0|36.6±15.3, min=24, max=559|pretrain|-| +|🔥ruozhiba-title-good|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|2597|0|41.9±19.3, min=22, max=246|pretrain|-| +|🔥ruozhiba-title-norm|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|81700|0|39.9±12.8, min=21, max=386|pretrain|-| diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md index 2737c214ec..414c714d9d 100644 --- a/docs/source_en/LLM/Supported-models-datasets.md +++ b/docs/source_en/LLM/Supported-models-datasets.md @@ -12,218 +12,220 @@ The table below introcudes all models supported by SWIFT: - Support VLLM: Whether the model supports [vllm](https://github.com/vllm-project/vllm) to accelerate infer and deployment. - Requires: The extra requirements used by the model. 
-| Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | -| --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | -|qwen-1_8b|[qwen/Qwen-1_8B](https://modelscope.cn/models/qwen/Qwen-1_8B/summary)|c_attn|default-generation|✔|✔||-| -|qwen-1_8b-chat|[qwen/Qwen-1_8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)|c_attn|qwen|✔|✔||-| -|qwen-1_8b-chat-int4|[qwen/Qwen-1_8B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-| -|qwen-1_8b-chat-int8|[qwen/Qwen-1_8B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-| -|qwen-7b|[qwen/Qwen-7B](https://modelscope.cn/models/qwen/Qwen-7B/summary)|c_attn|default-generation|✔|✔||-| -|qwen-7b-chat|[qwen/Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)|c_attn|qwen|✔|✔||-| -|qwen-7b-chat-int4|[qwen/Qwen-7B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-| -|qwen-7b-chat-int8|[qwen/Qwen-7B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-| -|qwen-14b|[qwen/Qwen-14B](https://modelscope.cn/models/qwen/Qwen-14B/summary)|c_attn|default-generation|✔|✔||-| -|qwen-14b-chat|[qwen/Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)|c_attn|qwen|✔|✔||-| -|qwen-14b-chat-int4|[qwen/Qwen-14B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-| -|qwen-14b-chat-int8|[qwen/Qwen-14B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-| -|qwen-72b|[qwen/Qwen-72B](https://modelscope.cn/models/qwen/Qwen-72B/summary)|c_attn|default-generation|✔|✔||-| 
-|qwen-72b-chat|[qwen/Qwen-72B-Chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary)|c_attn|qwen|✔|✔||-| -|qwen-72b-chat-int4|[qwen/Qwen-72B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-| -|qwen-72b-chat-int8|[qwen/Qwen-72B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-| -|qwen1half-0_5b|[qwen/Qwen1.5-0.5B](https://modelscope.cn/models/qwen/Qwen1.5-0.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-1_8b|[qwen/Qwen1.5-1.8B](https://modelscope.cn/models/qwen/Qwen1.5-1.8B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-4b|[qwen/Qwen1.5-4B](https://modelscope.cn/models/qwen/Qwen1.5-4B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-7b|[qwen/Qwen1.5-7B](https://modelscope.cn/models/qwen/Qwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-14b|[qwen/Qwen1.5-14B](https://modelscope.cn/models/qwen/Qwen1.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-32b|[qwen/Qwen1.5-32B](https://modelscope.cn/models/qwen/Qwen1.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-72b|[qwen/Qwen1.5-72B](https://modelscope.cn/models/qwen/Qwen1.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|codeqwen1half-7b|[qwen/CodeQwen1.5-7B](https://modelscope.cn/models/qwen/CodeQwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-moe-a2_7b|[qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-| -|qwen1half-0_5b-chat|[qwen/Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary)|q_proj, k_proj, 
v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-1_8b-chat|[qwen/Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-4b-chat|[qwen/Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-7b-chat|[qwen/Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-14b-chat|[qwen/Qwen1.5-14B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-32b-chat|[qwen/Qwen1.5-32B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-72b-chat|[qwen/Qwen1.5-72B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-moe-a2_7b-chat|[qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|codeqwen1half-7b-chat|[qwen/CodeQwen1.5-7B-Chat](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-| -|qwen1half-0_5b-chat-int4|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-1_8b-chat-int4|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-4b-chat-int4|[qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| 
-|qwen1half-7b-chat-int4|[qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-14b-chat-int4|[qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-32b-chat-int4|[qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-72b-chat-int4|[qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-0_5b-chat-int8|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-1_8b-chat-int8|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-4b-chat-int8|[qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-7b-chat-int8|[qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-14b-chat-int8|[qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-72b-chat-int8|[qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| 
-|qwen1half-moe-a2_7b-chat-int4|[qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-| -|qwen1half-0_5b-chat-awq|[qwen/Qwen1.5-0.5B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|qwen1half-1_8b-chat-awq|[qwen/Qwen1.5-1.8B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|qwen1half-4b-chat-awq|[qwen/Qwen1.5-4B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|qwen1half-7b-chat-awq|[qwen/Qwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|qwen1half-14b-chat-awq|[qwen/Qwen1.5-14B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|qwen1half-72b-chat-awq|[qwen/Qwen1.5-72B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|codeqwen1half-7b-chat-awq|[qwen/CodeQwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-| -|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||multi-modal, vision| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|✔|✘||multi-modal, vision| -|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|multi-modal, vision| 
-|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||multi-modal, audio| -|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||multi-modal, audio| -|chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|✘|✔||-| -|chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|✘|✔||-| -|chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|✘|✔||-| -|chatglm3-6b|[ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)|query_key_value|chatglm3|✘|✔||-| -|chatglm3-6b-32k|[ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)|query_key_value|chatglm3|✘|✔||-| -|codegeex2-6b|[ZhipuAI/codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary)|query_key_value|chatglm-generation|✘|✔|transformers<4.34|coding| -|llama2-7b|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| -|llama2-7b-chat|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔||-| -|llama2-13b|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| -|llama2-13b-chat|[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔||-| -|llama2-70b|[modelscope/Llama-2-70b-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| 
-|llama2-70b-chat|[modelscope/Llama-2-70b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔||-| -|llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|transformers>=4.38, aqlm, torch>=2.2.0|-| -|llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|✔|✘|transformers>=4.34|multi-modal, vision| -|llava1d6-yi-34b-instruct|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b/summary)|q_proj, k_proj, v_proj|llava-yi-instruct|✔|✘||multi-modal, vision| -|yi-6b|[01ai/Yi-6B](https://modelscope.cn/models/01ai/Yi-6B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-| -|yi-6b-200k|[01ai/Yi-6B-200K](https://modelscope.cn/models/01ai/Yi-6B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-| -|yi-6b-chat|[01ai/Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|✔||-| -|yi-9b|[01ai/Yi-9B](https://modelscope.cn/models/01ai/Yi-9B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-| -|yi-34b|[01ai/Yi-34B](https://modelscope.cn/models/01ai/Yi-34B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-| -|yi-34b-200k|[01ai/Yi-34B-200K](https://modelscope.cn/models/01ai/Yi-34B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-| -|yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|✔||-| -|yi-vl-6b-chat|[01ai/Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary)|q_proj, k_proj, v_proj|yi-vl|✔|✘|transformers>=4.34|multi-modal, vision| -|yi-vl-34b-chat|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B/summary)|q_proj, k_proj, 
v_proj|yi-vl|✔|✘|transformers>=4.34|multi-modal, vision| -|internlm-7b|[Shanghai_AI_Laboratory/internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✔||-| -|internlm-7b-chat|[Shanghai_AI_Laboratory/internlm-chat-7b-v1_1](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary)|q_proj, k_proj, v_proj|internlm|✘|✔||-| -|internlm-7b-chat-8k|[Shanghai_AI_Laboratory/internlm-chat-7b-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary)|q_proj, k_proj, v_proj|internlm|✘|✔||-| -|internlm-20b|[Shanghai_AI_Laboratory/internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✔||-| -|internlm-20b-chat|[Shanghai_AI_Laboratory/internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)|q_proj, k_proj, v_proj|internlm|✘|✔||-| -|internlm2-1_8b|[Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b/summary)|wqkv|default-generation-bos|✔|✔||-| -|internlm2-1_8b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft/summary)|wqkv|internlm2|✔|✔||-| -|internlm2-1_8b-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b/summary)|wqkv|internlm2|✔|✔||-| -|internlm2-7b-base|[Shanghai_AI_Laboratory/internlm2-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-7b/summary)|wqkv|default-generation-bos|✔|✔||-| -|internlm2-7b|[Shanghai_AI_Laboratory/internlm2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-7b/summary)|wqkv|default-generation-bos|✔|✔||-| 
-|internlm2-7b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b-sft/summary)|wqkv|internlm2|✔|✔||-| -|internlm2-7b-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b/summary)|wqkv|internlm2|✔|✔||-| -|internlm2-20b-base|[Shanghai_AI_Laboratory/internlm2-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-20b/summary)|wqkv|default-generation-bos|✔|✔||-| -|internlm2-20b|[Shanghai_AI_Laboratory/internlm2-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-20b/summary)|wqkv|default-generation-bos|✔|✔||-| -|internlm2-20b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b-sft/summary)|wqkv|internlm2|✔|✔||-| -|internlm2-20b-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b/summary)|wqkv|internlm2|✔|✔||-| -|internlm2-math-7b|[Shanghai_AI_Laboratory/internlm2-math-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-7b/summary)|wqkv|default-generation-bos|✔|✔||math| -|internlm2-math-7b-chat|[Shanghai_AI_Laboratory/internlm2-math-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b/summary)|wqkv|internlm2|✔|✔||math| -|internlm2-math-20b|[Shanghai_AI_Laboratory/internlm2-math-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-20b/summary)|wqkv|default-generation-bos|✔|✔||math| -|internlm2-math-20b-chat|[Shanghai_AI_Laboratory/internlm2-math-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-20b/summary)|wqkv|internlm2|✔|✔||math| -|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|wqkv|internlm-xcomposer2|✔|✘||multi-modal, vision| 
-|deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| -|deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-| -|deepseek-moe-16b|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| -|deepseek-moe-16b-chat|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-| -|deepseek-67b|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| -|deepseek-67b-chat|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-| -|deepseek-coder-1_3b|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||coding| -|deepseek-coder-1_3b-instruct|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding| -|deepseek-coder-6_7b|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||coding| -|deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding| -|deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, 
v_proj|default-generation-bos|✔|✔||coding| -|deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding| -|deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||math| -|deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math| -|deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math| -|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||multi-modal, vision| -|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||multi-modal, vision| -|gemma-2b|[AI-ModelScope/gemma-2b](https://modelscope.cn/models/AI-ModelScope/gemma-2b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.38|-| -|gemma-7b|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.38|-| -|gemma-2b-instruct|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.38|-| -|gemma-7b-instruct|[AI-ModelScope/gemma-7b-it](https://modelscope.cn/models/AI-ModelScope/gemma-7b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.38|-| -|minicpm-1b-sft-chat|[OpenBMB/MiniCPM-1B-sft-bf16](https://modelscope.cn/models/OpenBMB/MiniCPM-1B-sft-bf16/summary)|q_proj, k_proj, 
v_proj|minicpm|✔|✔|transformers>=4.36.0|-| -|minicpm-2b-sft-chat|[OpenBMB/MiniCPM-2B-sft-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-sft-fp32/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔||-| -|minicpm-2b-chat|[OpenBMB/MiniCPM-2B-dpo-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp32/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔||-| -|minicpm-2b-128k|[OpenBMB/MiniCPM-2B-128k](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-128k/summary)|q_proj, k_proj, v_proj|chatml|✔|✔|transformers>=4.36.0|-| -|minicpm-moe-8x2b|[OpenBMB/MiniCPM-MoE-8x2B](https://modelscope.cn/models/OpenBMB/MiniCPM-MoE-8x2B/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔|transformers>=4.36.0|-| -|minicpm-v-3b-chat|[OpenBMB/MiniCPM-V](https://modelscope.cn/models/OpenBMB/MiniCPM-V/summary)|q_proj, k_proj, v_proj|minicpm-v|✔|✘||-| -|minicpm-v-v2|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2/summary)|q_proj, k_proj, v_proj|minicpm-v|✔|✘||-| -|openbuddy-llama2-13b-chat|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-| -|openbuddy-llama-65b-chat|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-| -|openbuddy-llama2-70b-chat|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-| -|openbuddy-mistral-7b-chat|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v17.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.34|-| -|openbuddy-zephyr-7b-chat|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.34|-| 
-|openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-| -|openbuddy-mixtral-moe-7b-chat|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.36|-| -|mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.34|-| -|mistral-7b-v2|[AI-ModelScope/Mistral-7B-v0.2-hf](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.2-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.34|-| -|mistral-7b-instruct|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34|-| -|mistral-7b-instruct-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34|-| -|mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.36|-| -|mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.36|-| -|mixtral-moe-7b-aqlm-2bit-1x16|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|transformers>=4.38, aqlm, torch>=2.2.0|-| -|mixtral-moe-8x22b-v1|[AI-ModelScope/Mixtral-8x22B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x22B-v0.1/summary)|q_proj, k_proj, 
v_proj|default-generation-bos|✔|✔|transformers>=4.36|-| -|baichuan-7b|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)|W_pack|default-generation|✘|✔|transformers<4.34|-| -|baichuan-13b|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary)|W_pack|default-generation|✘|✔|transformers<4.34|-| -|baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|✘|✔|transformers<4.34|-| -|baichuan2-7b|[baichuan-inc/Baichuan2-7B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary)|W_pack|default-generation|✘|✔||-| -|baichuan2-7b-chat|[baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary)|W_pack|baichuan|✘|✔||-| -|baichuan2-7b-chat-int4|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary)|W_pack|baichuan|✘|✘|bitsandbytes<0.41.2, accelerate<0.26|-| -|baichuan2-13b|[baichuan-inc/Baichuan2-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary)|W_pack|default-generation|✘|✔||-| -|baichuan2-13b-chat|[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)|W_pack|baichuan|✘|✔||-| -|baichuan2-13b-chat-int4|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)|W_pack|baichuan|✘|✘|bitsandbytes<0.41.2, accelerate<0.26|-| -|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|transformers<4.35, icecream|-| -|mplug-owl2d1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|transformers<4.35, icecream|-| 
-|yuan2-2b-instruct|[YuanLLM/Yuan2.0-2B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-2B-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-| -|yuan2-2b-janus-instruct|[YuanLLM/Yuan2-2B-Janus-hf](https://modelscope.cn/models/YuanLLM/Yuan2-2B-Janus-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-| -|yuan2-51b-instruct|[YuanLLM/Yuan2.0-51B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-51B-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-| -|yuan2-102b-instruct|[YuanLLM/Yuan2.0-102B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-102B-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-| -|xverse-7b|[xverse/XVERSE-7B](https://modelscope.cn/models/xverse/XVERSE-7B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|xverse-7b-chat|[xverse/XVERSE-7B-Chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘||-| -|xverse-13b|[xverse/XVERSE-13B](https://modelscope.cn/models/xverse/XVERSE-13B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|xverse-13b-chat|[xverse/XVERSE-13B-Chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘||-| -|xverse-65b|[xverse/XVERSE-65B](https://modelscope.cn/models/xverse/XVERSE-65B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|xverse-65b-v2|[xverse/XVERSE-65B-2](https://modelscope.cn/models/xverse/XVERSE-65B-2/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|xverse-65b-chat|[xverse/XVERSE-65B-Chat](https://modelscope.cn/models/xverse/XVERSE-65B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘||-| -|xverse-13b-256k|[xverse/XVERSE-13B-256K](https://modelscope.cn/models/xverse/XVERSE-13B-256K/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|xverse-moe-a4_2b|[xverse/XVERSE-MoE-A4.2B](https://modelscope.cn/models/xverse/XVERSE-MoE-A4.2B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|orion-14b|[OrionStarAI/Orion-14B-Base](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base/summary)|q_proj, k_proj, 
v_proj|default-generation|✔|✘||-| -|orion-14b-chat|[OrionStarAI/Orion-14B-Chat](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat/summary)|q_proj, k_proj, v_proj|orion|✔|✘||-| -|bluelm-7b|[vivo-ai/BlueLM-7B-Base](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘||-| -|bluelm-7b-32k|[vivo-ai/BlueLM-7B-Base-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘||-| -|bluelm-7b-chat|[vivo-ai/BlueLM-7B-Chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary)|q_proj, k_proj, v_proj|bluelm|✘|✘||-| -|bluelm-7b-chat-32k|[vivo-ai/BlueLM-7B-Chat-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)|q_proj, k_proj, v_proj|bluelm|✘|✘||-| -|ziya2-13b|[Fengshenbang/Ziya2-13B-Base](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-| -|ziya2-13b-chat|[Fengshenbang/Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)|q_proj, k_proj, v_proj|ziya|✔|✔||-| -|skywork-13b|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘||-| -|skywork-13b-chat|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)|q_proj, k_proj, v_proj|skywork|✘|✘||-| -|zephyr-7b-beta-chat|[modelscope/zephyr-7b-beta](https://modelscope.cn/models/modelscope/zephyr-7b-beta/summary)|q_proj, k_proj, v_proj|zephyr|✔|✔|transformers>=4.34|-| -|polylm-13b|[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary)|c_attn|default-generation|✘|✘||-| -|seqgpt-560m|[damo/nlp_seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)|query_key_value|default-generation|✘|✔||-| -|sus-34b-chat|[SUSTC/SUS-Chat-34B](https://modelscope.cn/models/SUSTC/SUS-Chat-34B/summary)|q_proj, k_proj, 
v_proj|sus|✔|✔||-| -|tongyi-finance-14b|[TongyiFinance/Tongyi-Finance-14B](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary)|c_attn|default-generation|✔|✔||financial| -|tongyi-finance-14b-chat|[TongyiFinance/Tongyi-Finance-14B-Chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary)|c_attn|qwen|✔|✔||financial| -|tongyi-finance-14b-chat-int4|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|financial| -|codefuse-codellama-34b-chat|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)|q_proj, k_proj, v_proj|codefuse-codellama|✔|✔||coding| -|codefuse-codegeex2-6b-chat|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B/summary)|query_key_value|codefuse|✘|✔|transformers<4.34|coding| -|codefuse-qwen-14b-chat|[codefuse-ai/CodeFuse-QWen-14B](https://modelscope.cn/models/codefuse-ai/CodeFuse-QWen-14B/summary)|c_attn|codefuse|✔|✔||coding| -|phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|✔|✔||coding| -|cogvlm-17b-instruct|[ZhipuAI/cogvlm-chat](https://modelscope.cn/models/ZhipuAI/cogvlm-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense|cogvlm-instruct|✘|✘||multi-modal, vision| -|cogagent-18b-chat|[ZhipuAI/cogagent-chat](https://modelscope.cn/models/ZhipuAI/cogagent-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-chat|✘|✘||multi-modal, vision| -|cogagent-18b-instruct|[ZhipuAI/cogagent-vqa](https://modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, 
dense|cogagent-instruct|✘|✘||multi-modal, vision| -|mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-| -|mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-| -|mamba-390m|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-| -|mamba-790m|[AI-ModelScope/mamba-790m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-790m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-| -|mamba-1.4b|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-| -|mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-| -|telechat-7b|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B/summary)|key_value, query|telechat|✔|✘||-| -|telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|key_value, query|telechat|✔|✘||-| -|grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-| -|dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|✔|✔|transformers>=4.36|-| -|dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|✔|✔|transformers>=4.36|-| 
-|mengzi3-13b-base|[langboat/Mengzi3-13B-Base](https://modelscope.cn/models/langboat/Mengzi3-13B-Base/summary)|q_proj, k_proj, v_proj|mengzi|✔|✔||-| -|c4ai-command-r-v01|[AI-ModelScope/c4ai-command-r-v01](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-v01/summary)|q_proj, k_proj, v_proj|c4ai|✔|✘|transformers>=4.39.1|-| -|c4ai-command-r-plus|[AI-ModelScope/c4ai-command-r-plus](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-plus/summary)|q_proj, k_proj, v_proj|c4ai|✔|✘|transformers>4.39|-| +| Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID | +| --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | ----------- | +|qwen-1_8b|[qwen/Qwen-1_8B](https://modelscope.cn/models/qwen/Qwen-1_8B/summary)|c_attn|default-generation|✔|✔||-|[Qwen/Qwen-1_8B](https://huggingface.co/Qwen/Qwen-1_8B)| +|qwen-1_8b-chat|[qwen/Qwen-1_8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)|c_attn|qwen|✔|✔||-|[Qwen/Qwen-1_8B-Chat](https://huggingface.co/Qwen/Qwen-1_8B-Chat)| +|qwen-1_8b-chat-int4|[qwen/Qwen-1_8B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-|[Qwen/Qwen-1_8B-Chat-Int4](https://huggingface.co/Qwen/Qwen-1_8B-Chat-Int4)| +|qwen-1_8b-chat-int8|[qwen/Qwen-1_8B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-|[Qwen/Qwen-1_8B-Chat-Int8](https://huggingface.co/Qwen/Qwen-1_8B-Chat-Int8)| +|qwen-7b|[qwen/Qwen-7B](https://modelscope.cn/models/qwen/Qwen-7B/summary)|c_attn|default-generation|✔|✔||-|[Qwen/Qwen-7B](https://huggingface.co/Qwen/Qwen-7B)| +|qwen-7b-chat|[qwen/Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)|c_attn|qwen|✔|✔||-|[Qwen/Qwen-7B-Chat](https://huggingface.co/Qwen/Qwen-7B-Chat)|
+|qwen-7b-chat-int4|[qwen/Qwen-7B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-|[Qwen/Qwen-7B-Chat-Int4](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4)| +|qwen-7b-chat-int8|[qwen/Qwen-7B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-|[Qwen/Qwen-7B-Chat-Int8](https://huggingface.co/Qwen/Qwen-7B-Chat-Int8)| +|qwen-14b|[qwen/Qwen-14B](https://modelscope.cn/models/qwen/Qwen-14B/summary)|c_attn|default-generation|✔|✔||-|[Qwen/Qwen-14B](https://huggingface.co/Qwen/Qwen-14B)| +|qwen-14b-chat|[qwen/Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)|c_attn|qwen|✔|✔||-|[Qwen/Qwen-14B-Chat](https://huggingface.co/Qwen/Qwen-14B-Chat)| +|qwen-14b-chat-int4|[qwen/Qwen-14B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-|[Qwen/Qwen-14B-Chat-Int4](https://huggingface.co/Qwen/Qwen-14B-Chat-Int4)| +|qwen-14b-chat-int8|[qwen/Qwen-14B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-|[Qwen/Qwen-14B-Chat-Int8](https://huggingface.co/Qwen/Qwen-14B-Chat-Int8)| +|qwen-72b|[qwen/Qwen-72B](https://modelscope.cn/models/qwen/Qwen-72B/summary)|c_attn|default-generation|✔|✔||-|[Qwen/Qwen-72B](https://huggingface.co/Qwen/Qwen-72B)| +|qwen-72b-chat|[qwen/Qwen-72B-Chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary)|c_attn|qwen|✔|✔||-|[Qwen/Qwen-72B-Chat](https://huggingface.co/Qwen/Qwen-72B-Chat)| +|qwen-72b-chat-int4|[qwen/Qwen-72B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|-|[Qwen/Qwen-72B-Chat-Int4](https://huggingface.co/Qwen/Qwen-72B-Chat-Int4)| +|qwen-72b-chat-int8|[qwen/Qwen-72B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|-|[Qwen/Qwen-72B-Chat-Int8](https://huggingface.co/Qwen/Qwen-72B-Chat-Int8)| 
+|qwen1half-0_5b|[qwen/Qwen1.5-0.5B](https://modelscope.cn/models/qwen/Qwen1.5-0.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-0.5B](https://huggingface.co/Qwen/Qwen1.5-0.5B)| +|qwen1half-1_8b|[qwen/Qwen1.5-1.8B](https://modelscope.cn/models/qwen/Qwen1.5-1.8B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-1.8B](https://huggingface.co/Qwen/Qwen1.5-1.8B)| +|qwen1half-4b|[qwen/Qwen1.5-4B](https://modelscope.cn/models/qwen/Qwen1.5-4B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B)| +|qwen1half-7b|[qwen/Qwen1.5-7B](https://modelscope.cn/models/qwen/Qwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-7B](https://huggingface.co/Qwen/Qwen1.5-7B)| +|qwen1half-14b|[qwen/Qwen1.5-14B](https://modelscope.cn/models/qwen/Qwen1.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-14B](https://huggingface.co/Qwen/Qwen1.5-14B)| +|qwen1half-32b|[qwen/Qwen1.5-32B](https://modelscope.cn/models/qwen/Qwen1.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-32B](https://huggingface.co/Qwen/Qwen1.5-32B)| +|qwen1half-72b|[qwen/Qwen1.5-72B](https://modelscope.cn/models/qwen/Qwen1.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-72B](https://huggingface.co/Qwen/Qwen1.5-72B)| +|codeqwen1half-7b|[qwen/CodeQwen1.5-7B](https://modelscope.cn/models/qwen/CodeQwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|-| +|qwen1half-moe-a2_7b|[qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B)| 
+|qwen1half-0_5b-chat|[qwen/Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-0.5B-Chat](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat)| +|qwen1half-1_8b-chat|[qwen/Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-1.8B-Chat](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat)| +|qwen1half-4b-chat|[qwen/Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-4B-Chat](https://huggingface.co/Qwen/Qwen1.5-4B-Chat)| +|qwen1half-7b-chat|[qwen/Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-7B-Chat](https://huggingface.co/Qwen/Qwen1.5-7B-Chat)| +|qwen1half-14b-chat|[qwen/Qwen1.5-14B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-14B-Chat](https://huggingface.co/Qwen/Qwen1.5-14B-Chat)| +|qwen1half-32b-chat|[qwen/Qwen1.5-32B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-32B-Chat](https://huggingface.co/Qwen/Qwen1.5-32B-Chat)| +|qwen1half-72b-chat|[qwen/Qwen1.5-72B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-72B-Chat](https://huggingface.co/Qwen/Qwen1.5-72B-Chat)| +|qwen1half-moe-a2_7b-chat|[qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat)| +|codeqwen1half-7b-chat|[qwen/CodeQwen1.5-7B-Chat](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat/summary)|q_proj, k_proj, 
v_proj|qwen|✔|✔|transformers>=4.37|-|-| +|qwen1half-0_5b-chat-int4|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4)| +|qwen1half-1_8b-chat-int4|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4)| +|qwen1half-4b-chat-int4|[qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GPTQ-Int4)| +|qwen1half-7b-chat-int4|[qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4)| +|qwen1half-14b-chat-int4|[qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-GPTQ-Int4)| +|qwen1half-32b-chat-int4|[qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-32B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-32B-Chat-GPTQ-Int4)| +|qwen1half-72b-chat-int4|[qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, 
transformers>=4.37|-|[Qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-GPTQ-Int4)| +|qwen1half-0_5b-chat-int8|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8)| +|qwen1half-1_8b-chat-int8|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8)| +|qwen1half-4b-chat-int8|[qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-GPTQ-Int8)| +|qwen1half-7b-chat-int8|[qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int8)| +|qwen1half-14b-chat-int8|[qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-GPTQ-Int8)| +|qwen1half-72b-chat-int8|[qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-GPTQ-Int8)| 
+|qwen1half-moe-a2_7b-chat-int4|[qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4)| +|qwen1half-0_5b-chat-awq|[qwen/Qwen1.5-0.5B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-0.5B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat-AWQ)| +|qwen1half-1_8b-chat-awq|[qwen/Qwen1.5-1.8B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-1.8B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-AWQ)| +|qwen1half-4b-chat-awq|[qwen/Qwen1.5-4B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-4B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-4B-Chat-AWQ)| +|qwen1half-7b-chat-awq|[qwen/Qwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-7B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-AWQ)| +|qwen1half-14b-chat-awq|[qwen/Qwen1.5-14B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-14B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-AWQ)| +|qwen1half-32b-chat-awq|[qwen/Qwen1.5-32B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-32B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-32B-Chat-AWQ)| +|qwen1half-72b-chat-awq|[qwen/Qwen1.5-72B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-AWQ/summary)|q_proj, 
k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-72B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-AWQ)| +|codeqwen1half-7b-chat-awq|[qwen/CodeQwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|-| +|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||multi-modal, vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|✔|✘||multi-modal, vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)| +|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|multi-modal, vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)| +|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||multi-modal, audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)| +|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||multi-modal, audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)| +|chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|✘|✔||-|[THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)| +|chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|✘|✔||-|[THUDM/chatglm2-6b-32k](https://huggingface.co/THUDM/chatglm2-6b-32k)| +|chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|✘|✔||-|[THUDM/chatglm3-6b-base](https://huggingface.co/THUDM/chatglm3-6b-base)| 
+|chatglm3-6b|[ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)|query_key_value|chatglm3|✘|✔||-|[THUDM/chatglm3-6b](https://huggingface.co/THUDM/chatglm3-6b)| +|chatglm3-6b-32k|[ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)|query_key_value|chatglm3|✘|✔||-|[THUDM/chatglm3-6b-32k](https://huggingface.co/THUDM/chatglm3-6b-32k)| +|codegeex2-6b|[ZhipuAI/codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary)|query_key_value|chatglm-generation|✘|✔|transformers<4.34|coding|[THUDM/codegeex2-6b](https://huggingface.co/THUDM/codegeex2-6b)| +|llama2-7b|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)| +|llama2-7b-chat|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔||-|[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)| +|llama2-13b|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf)| +|llama2-13b-chat|[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔||-|[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)| +|llama2-70b|[modelscope/Llama-2-70b-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf)| +|llama2-70b-chat|[modelscope/Llama-2-70b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)|q_proj, k_proj, 
v_proj|llama|✔|✔||-|[meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)| +|llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)| +|llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|✔|✘|transformers>=4.34|multi-modal, vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)| +|llava1d6-yi-34b-instruct|[AI-ModelScope/llava-v1.6-34b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-34b/summary)|q_proj, k_proj, v_proj|llava-yi-instruct|✔|✘||multi-modal, vision|[liuhaotian/llava-v1.6-34b](https://huggingface.co/liuhaotian/llava-v1.6-34b)| +|yi-6b|[01ai/Yi-6B](https://modelscope.cn/models/01ai/Yi-6B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B)| +|yi-6b-200k|[01ai/Yi-6B-200K](https://modelscope.cn/models/01ai/Yi-6B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-6B-200K](https://huggingface.co/01-ai/Yi-6B-200K)| +|yi-6b-chat|[01ai/Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|✔||-|[01-ai/Yi-6B-Chat](https://huggingface.co/01-ai/Yi-6B-Chat)| +|yi-9b|[01ai/Yi-9B](https://modelscope.cn/models/01ai/Yi-9B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-9B](https://huggingface.co/01-ai/Yi-9B)| +|yi-9b-200k|[01ai/Yi-9B-200K](https://modelscope.cn/models/01ai/Yi-9B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-9B-200K](https://huggingface.co/01-ai/Yi-9B-200K)| 
+|yi-34b|[01ai/Yi-34B](https://modelscope.cn/models/01ai/Yi-34B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-34B](https://huggingface.co/01-ai/Yi-34B)| +|yi-34b-200k|[01ai/Yi-34B-200K](https://modelscope.cn/models/01ai/Yi-34B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-34B-200K](https://huggingface.co/01-ai/Yi-34B-200K)| +|yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|✔||-|[01-ai/Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat)| +|yi-vl-6b-chat|[01ai/Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary)|q_proj, k_proj, v_proj|yi-vl|✔|✘|transformers>=4.34|multi-modal, vision|[01-ai/Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B)| +|yi-vl-34b-chat|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B/summary)|q_proj, k_proj, v_proj|yi-vl|✔|✘|transformers>=4.34|multi-modal, vision|[01-ai/Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)| +|internlm-7b|[Shanghai_AI_Laboratory/internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✔||-|[internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b)| +|internlm-7b-chat|[Shanghai_AI_Laboratory/internlm-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b/summary)|q_proj, k_proj, v_proj|internlm|✘|✔||-|[internlm/internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b)| +|internlm-7b-chat-8k|[Shanghai_AI_Laboratory/internlm-chat-7b-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary)|q_proj, k_proj, v_proj|internlm|✘|✔||-|-| +|internlm-20b|[Shanghai_AI_Laboratory/internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✔||-|[internlm/internlm-20b](https://huggingface.co/internlm/internlm-20b)|
+|internlm-20b-chat|[Shanghai_AI_Laboratory/internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)|q_proj, k_proj, v_proj|internlm|✘|✔||-|[internlm/internlm-chat-20b](https://huggingface.co/internlm/internlm-chat-20b)| +|internlm2-1_8b|[Shanghai_AI_Laboratory/internlm2-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-1_8b/summary)|wqkv|default-generation-bos|✔|✔||-|[internlm/internlm2-1_8b](https://huggingface.co/internlm/internlm2-1_8b)| +|internlm2-1_8b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b-sft/summary)|wqkv|internlm2|✔|✔||-|[internlm/internlm2-chat-1_8b-sft](https://huggingface.co/internlm/internlm2-chat-1_8b-sft)| +|internlm2-1_8b-chat|[Shanghai_AI_Laboratory/internlm2-chat-1_8b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-1_8b/summary)|wqkv|internlm2|✔|✔||-|[internlm/internlm2-chat-1_8b](https://huggingface.co/internlm/internlm2-chat-1_8b)| +|internlm2-7b-base|[Shanghai_AI_Laboratory/internlm2-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-7b/summary)|wqkv|default-generation-bos|✔|✔||-|[internlm/internlm2-base-7b](https://huggingface.co/internlm/internlm2-base-7b)| +|internlm2-7b|[Shanghai_AI_Laboratory/internlm2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-7b/summary)|wqkv|default-generation-bos|✔|✔||-|[internlm/internlm2-7b](https://huggingface.co/internlm/internlm2-7b)| +|internlm2-7b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b-sft/summary)|wqkv|internlm2|✔|✔||-|[internlm/internlm2-chat-7b-sft](https://huggingface.co/internlm/internlm2-chat-7b-sft)|
+|internlm2-7b-chat|[Shanghai_AI_Laboratory/internlm2-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-7b/summary)|wqkv|internlm2|✔|✔||-|[internlm/internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b)| +|internlm2-20b-base|[Shanghai_AI_Laboratory/internlm2-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-base-20b/summary)|wqkv|default-generation-bos|✔|✔||-|[internlm/internlm2-base-20b](https://huggingface.co/internlm/internlm2-base-20b)| +|internlm2-20b|[Shanghai_AI_Laboratory/internlm2-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-20b/summary)|wqkv|default-generation-bos|✔|✔||-|[internlm/internlm2-20b](https://huggingface.co/internlm/internlm2-20b)| +|internlm2-20b-sft-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b-sft](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b-sft/summary)|wqkv|internlm2|✔|✔||-|[internlm/internlm2-chat-20b-sft](https://huggingface.co/internlm/internlm2-chat-20b-sft)| +|internlm2-20b-chat|[Shanghai_AI_Laboratory/internlm2-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-chat-20b/summary)|wqkv|internlm2|✔|✔||-|[internlm/internlm2-chat-20b](https://huggingface.co/internlm/internlm2-chat-20b)| +|internlm2-math-7b|[Shanghai_AI_Laboratory/internlm2-math-base-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-7b/summary)|wqkv|default-generation-bos|✔|✔||math|[internlm/internlm2-math-base-7b](https://huggingface.co/internlm/internlm2-math-base-7b)| +|internlm2-math-7b-chat|[Shanghai_AI_Laboratory/internlm2-math-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b/summary)|wqkv|internlm2|✔|✔||math|[internlm/internlm2-math-7b](https://huggingface.co/internlm/internlm2-math-7b)| 
+|internlm2-math-20b|[Shanghai_AI_Laboratory/internlm2-math-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-20b/summary)|wqkv|default-generation-bos|✔|✔||math|[internlm/internlm2-math-base-20b](https://huggingface.co/internlm/internlm2-math-base-20b)| +|internlm2-math-20b-chat|[Shanghai_AI_Laboratory/internlm2-math-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-20b/summary)|wqkv|internlm2|✔|✔||math|[internlm/internlm2-math-20b](https://huggingface.co/internlm/internlm2-math-20b)| +|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|wqkv|internlm-xcomposer2|✔|✘||multi-modal, vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)| +|deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[deepseek-ai/deepseek-llm-7b-base](https://huggingface.co/deepseek-ai/deepseek-llm-7b-base)| +|deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-|[deepseek-ai/deepseek-llm-7b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat)| +|deepseek-moe-16b|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[deepseek-ai/deepseek-moe-16b-base](https://huggingface.co/deepseek-ai/deepseek-moe-16b-base)| +|deepseek-moe-16b-chat|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-|[deepseek-ai/deepseek-moe-16b-chat](https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat)| 
+|deepseek-67b|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[deepseek-ai/deepseek-llm-67b-base](https://huggingface.co/deepseek-ai/deepseek-llm-67b-base)| +|deepseek-67b-chat|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-|[deepseek-ai/deepseek-llm-67b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat)| +|deepseek-coder-1_3b|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||coding|[deepseek-ai/deepseek-coder-1.3b-base](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base)| +|deepseek-coder-1_3b-instruct|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-1.3b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-instruct)| +|deepseek-coder-6_7b|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||coding|[deepseek-ai/deepseek-coder-6.7b-base](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base)| +|deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)| +|deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, 
v_proj|default-generation-bos|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-base](https://huggingface.co/deepseek-ai/deepseek-coder-33b-base)| +|deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔||coding|[deepseek-ai/deepseek-coder-33b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct)| +|deepseek-math-7b|[deepseek-ai/deepseek-math-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||math|[deepseek-ai/deepseek-math-7b-base](https://huggingface.co/deepseek-ai/deepseek-math-7b-base)| +|deepseek-math-7b-instruct|[deepseek-ai/deepseek-math-7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-instruct](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct)| +|deepseek-math-7b-chat|[deepseek-ai/deepseek-math-7b-rl](https://modelscope.cn/models/deepseek-ai/deepseek-math-7b-rl/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||math|[deepseek-ai/deepseek-math-7b-rl](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl)| +|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||multi-modal, vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)| +|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||multi-modal, vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)| +|gemma-2b|[AI-ModelScope/gemma-2b](https://modelscope.cn/models/AI-ModelScope/gemma-2b/summary)|q_proj, k_proj, 
v_proj|default-generation-bos|✔|✔|transformers>=4.38|-|[google/gemma-2b](https://huggingface.co/google/gemma-2b)| +|gemma-7b|[AI-ModelScope/gemma-7b](https://modelscope.cn/models/AI-ModelScope/gemma-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.38|-|[google/gemma-7b](https://huggingface.co/google/gemma-7b)| +|gemma-2b-instruct|[AI-ModelScope/gemma-2b-it](https://modelscope.cn/models/AI-ModelScope/gemma-2b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.38|-|[google/gemma-2b-it](https://huggingface.co/google/gemma-2b-it)| +|gemma-7b-instruct|[AI-ModelScope/gemma-7b-it](https://modelscope.cn/models/AI-ModelScope/gemma-7b-it/summary)|q_proj, k_proj, v_proj|gemma|✔|✔|transformers>=4.38|-|[google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it)| +|minicpm-1b-sft-chat|[OpenBMB/MiniCPM-1B-sft-bf16](https://modelscope.cn/models/OpenBMB/MiniCPM-1B-sft-bf16/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔|transformers>=4.36.0|-|[openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16)| +|minicpm-2b-sft-chat|[OpenBMB/MiniCPM-2B-sft-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-sft-fp32/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔||-|[openbmb/MiniCPM-2B-sft-fp32](https://huggingface.co/openbmb/MiniCPM-2B-sft-fp32)| +|minicpm-2b-chat|[OpenBMB/MiniCPM-2B-dpo-fp32](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-dpo-fp32/summary)|q_proj, k_proj, v_proj|minicpm|✔|✔||-|[openbmb/MiniCPM-2B-dpo-fp32](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32)| +|minicpm-2b-128k|[OpenBMB/MiniCPM-2B-128k](https://modelscope.cn/models/OpenBMB/MiniCPM-2B-128k/summary)|q_proj, k_proj, v_proj|chatml|✔|✔|transformers>=4.36.0|-|[openbmb/MiniCPM-2B-128k](https://huggingface.co/openbmb/MiniCPM-2B-128k)| +|minicpm-moe-8x2b|[OpenBMB/MiniCPM-MoE-8x2B](https://modelscope.cn/models/OpenBMB/MiniCPM-MoE-8x2B/summary)|q_proj, k_proj, 
v_proj|minicpm|✔|✔|transformers>=4.36.0|-|[openbmb/MiniCPM-MoE-8x2B](https://huggingface.co/openbmb/MiniCPM-MoE-8x2B)| +|minicpm-v-3b-chat|[OpenBMB/MiniCPM-V](https://modelscope.cn/models/OpenBMB/MiniCPM-V/summary)|q_proj, k_proj, v_proj|minicpm-v|✔|✘||-|[openbmb/MiniCPM-V](https://huggingface.co/openbmb/MiniCPM-V)| +|minicpm-v-v2|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2/summary)|q_proj, k_proj, v_proj|minicpm-v|✔|✘||-|[openbmb/MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2)| +|openbuddy-llama2-13b-chat|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://huggingface.co/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16)| +|openbuddy-llama-65b-chat|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama-65b-v8-bf16)| +|openbuddy-llama2-70b-chat|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://huggingface.co/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16)| +|openbuddy-mistral-7b-chat|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v17.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.34|-|[OpenBuddy/openbuddy-mistral-7b-v17.1-32k](https://huggingface.co/OpenBuddy/openbuddy-mistral-7b-v17.1-32k)| +|openbuddy-zephyr-7b-chat|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)|q_proj, k_proj, 
v_proj|openbuddy|✔|✔|transformers>=4.34|-|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://huggingface.co/OpenBuddy/openbuddy-zephyr-7b-v14.1)| +|openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://huggingface.co/OpenBuddy/openbuddy-deepseek-67b-v15.2)| +|openbuddy-mixtral-moe-7b-chat|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.36|-|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://huggingface.co/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k)| +|mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.34|-|[mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)| +|mistral-7b-v2|[AI-ModelScope/Mistral-7B-v0.2-hf](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.2-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.34|-|[alpindale/Mistral-7B-v0.2-hf](https://huggingface.co/alpindale/Mistral-7B-v0.2-hf)| +|mistral-7b-instruct|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34|-|[mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)| +|mistral-7b-instruct-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34|-|[mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)| 
+|mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.36|-|[mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)| +|mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.36|-|[mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| +|mixtral-moe-7b-aqlm-2bit-1x16|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf)| +|mixtral-moe-8x22b-v1|[AI-ModelScope/Mixtral-8x22B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x22B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.36|-|[mistral-community/Mixtral-8x22B-v0.1](https://huggingface.co/mistral-community/Mixtral-8x22B-v0.1)| +|baichuan-7b|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)|W_pack|default-generation|✘|✔|transformers<4.34|-|[baichuan-inc/Baichuan-7B](https://huggingface.co/baichuan-inc/Baichuan-7B)| +|baichuan-13b|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary)|W_pack|default-generation|✘|✔|transformers<4.34|-|[baichuan-inc/Baichuan-13B-Base](https://huggingface.co/baichuan-inc/Baichuan-13B-Base)| +|baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|✘|✔|transformers<4.34|-|[baichuan-inc/Baichuan-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan-13B-Chat)| 
+|baichuan2-7b|[baichuan-inc/Baichuan2-7B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary)|W_pack|default-generation|✘|✔||-|[baichuan-inc/Baichuan2-7B-Base](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base)| +|baichuan2-7b-chat|[baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary)|W_pack|baichuan|✘|✔||-|[baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)| +|baichuan2-7b-chat-int4|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary)|W_pack|baichuan|✘|✘|bitsandbytes<0.41.2, accelerate<0.26|-|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat-4bits)| +|baichuan2-13b|[baichuan-inc/Baichuan2-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary)|W_pack|default-generation|✘|✔||-|[baichuan-inc/Baichuan2-13B-Base](https://huggingface.co/baichuan-inc/Baichuan2-13B-Base)| +|baichuan2-13b-chat|[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)|W_pack|baichuan|✘|✔||-|[baichuan-inc/Baichuan2-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat)| +|baichuan2-13b-chat-int4|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)|W_pack|baichuan|✘|✘|bitsandbytes<0.41.2, accelerate<0.26|-|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat-4bits)| +|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|transformers<4.35, icecream|-|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)| +|mplug-owl2d1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, 
c_attn.multiway.1|mplug-owl2|✔|✘|transformers<4.35, icecream|-|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)| +|yuan2-2b-instruct|[YuanLLM/Yuan2.0-2B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-2B-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-|[IEITYuan/Yuan2-2B-hf](https://huggingface.co/IEITYuan/Yuan2-2B-hf)| +|yuan2-2b-janus-instruct|[YuanLLM/Yuan2-2B-Janus-hf](https://modelscope.cn/models/YuanLLM/Yuan2-2B-Janus-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-|[IEITYuan/Yuan2-2B-Janus-hf](https://huggingface.co/IEITYuan/Yuan2-2B-Janus-hf)| +|yuan2-51b-instruct|[YuanLLM/Yuan2.0-51B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-51B-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-|[IEITYuan/Yuan2-51B-hf](https://huggingface.co/IEITYuan/Yuan2-51B-hf)| +|yuan2-102b-instruct|[YuanLLM/Yuan2.0-102B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-102B-hf/summary)|q_proj, k_proj, v_proj|yuan|✔|✘||-|[IEITYuan/Yuan2-102B-hf](https://huggingface.co/IEITYuan/Yuan2-102B-hf)| +|xverse-7b|[xverse/XVERSE-7B](https://modelscope.cn/models/xverse/XVERSE-7B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|[xverse/XVERSE-7B](https://huggingface.co/xverse/XVERSE-7B)| +|xverse-7b-chat|[xverse/XVERSE-7B-Chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘||-|[xverse/XVERSE-7B-Chat](https://huggingface.co/xverse/XVERSE-7B-Chat)| +|xverse-13b|[xverse/XVERSE-13B](https://modelscope.cn/models/xverse/XVERSE-13B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|[xverse/XVERSE-13B](https://huggingface.co/xverse/XVERSE-13B)| +|xverse-13b-chat|[xverse/XVERSE-13B-Chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘||-|[xverse/XVERSE-13B-Chat](https://huggingface.co/xverse/XVERSE-13B-Chat)| +|xverse-65b|[xverse/XVERSE-65B](https://modelscope.cn/models/xverse/XVERSE-65B/summary)|q_proj, k_proj, 
v_proj|default-generation|✘|✘||-|[xverse/XVERSE-65B](https://huggingface.co/xverse/XVERSE-65B)| +|xverse-65b-v2|[xverse/XVERSE-65B-2](https://modelscope.cn/models/xverse/XVERSE-65B-2/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|[xverse/XVERSE-65B-2](https://huggingface.co/xverse/XVERSE-65B-2)| +|xverse-65b-chat|[xverse/XVERSE-65B-Chat](https://modelscope.cn/models/xverse/XVERSE-65B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘||-|[xverse/XVERSE-65B-Chat](https://huggingface.co/xverse/XVERSE-65B-Chat)| +|xverse-13b-256k|[xverse/XVERSE-13B-256K](https://modelscope.cn/models/xverse/XVERSE-13B-256K/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|[xverse/XVERSE-13B-256K](https://huggingface.co/xverse/XVERSE-13B-256K)| +|xverse-moe-a4_2b|[xverse/XVERSE-MoE-A4.2B](https://modelscope.cn/models/xverse/XVERSE-MoE-A4.2B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|[xverse/XVERSE-MoE-A4.2B](https://huggingface.co/xverse/XVERSE-MoE-A4.2B)| +|orion-14b|[OrionStarAI/Orion-14B-Base](https://modelscope.cn/models/OrionStarAI/Orion-14B-Base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✘||-|[OrionStarAI/Orion-14B-Base](https://huggingface.co/OrionStarAI/Orion-14B-Base)| +|orion-14b-chat|[OrionStarAI/Orion-14B-Chat](https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat/summary)|q_proj, k_proj, v_proj|orion|✔|✘||-|[OrionStarAI/Orion-14B-Chat](https://huggingface.co/OrionStarAI/Orion-14B-Chat)| +|bluelm-7b|[vivo-ai/BlueLM-7B-Base](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘||-|[vivo-ai/BlueLM-7B-Base](https://huggingface.co/vivo-ai/BlueLM-7B-Base)| +|bluelm-7b-32k|[vivo-ai/BlueLM-7B-Base-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘||-|[vivo-ai/BlueLM-7B-Base-32K](https://huggingface.co/vivo-ai/BlueLM-7B-Base-32K)| 
+|bluelm-7b-chat|[vivo-ai/BlueLM-7B-Chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary)|q_proj, k_proj, v_proj|bluelm|✘|✘||-|[vivo-ai/BlueLM-7B-Chat](https://huggingface.co/vivo-ai/BlueLM-7B-Chat)| +|bluelm-7b-chat-32k|[vivo-ai/BlueLM-7B-Chat-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)|q_proj, k_proj, v_proj|bluelm|✘|✘||-|[vivo-ai/BlueLM-7B-Chat-32K](https://huggingface.co/vivo-ai/BlueLM-7B-Chat-32K)| +|ziya2-13b|[Fengshenbang/Ziya2-13B-Base](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[IDEA-CCNL/Ziya2-13B-Base](https://huggingface.co/IDEA-CCNL/Ziya2-13B-Base)| +|ziya2-13b-chat|[Fengshenbang/Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)|q_proj, k_proj, v_proj|ziya|✔|✔||-|[IDEA-CCNL/Ziya2-13B-Chat](https://huggingface.co/IDEA-CCNL/Ziya2-13B-Chat)| +|skywork-13b|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘||-|[Skywork/Skywork-13B-base](https://huggingface.co/Skywork/Skywork-13B-base)| +|skywork-13b-chat|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)|q_proj, k_proj, v_proj|skywork|✘|✘||-|-| +|zephyr-7b-beta-chat|[modelscope/zephyr-7b-beta](https://modelscope.cn/models/modelscope/zephyr-7b-beta/summary)|q_proj, k_proj, v_proj|zephyr|✔|✔|transformers>=4.34|-|[HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)| +|polylm-13b|[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary)|c_attn|default-generation|✘|✘||-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)| +|seqgpt-560m|[damo/nlp_seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)|query_key_value|default-generation|✘|✔||-|[DAMO-NLP/SeqGPT-560M](https://huggingface.co/DAMO-NLP/SeqGPT-560M)| 
+|sus-34b-chat|[SUSTC/SUS-Chat-34B](https://modelscope.cn/models/SUSTC/SUS-Chat-34B/summary)|q_proj, k_proj, v_proj|sus|✔|✔||-|[SUSTech/SUS-Chat-34B](https://huggingface.co/SUSTech/SUS-Chat-34B)| +|tongyi-finance-14b|[TongyiFinance/Tongyi-Finance-14B](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary)|c_attn|default-generation|✔|✔||financial|-| +|tongyi-finance-14b-chat|[TongyiFinance/Tongyi-Finance-14B-Chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary)|c_attn|qwen|✔|✔||financial|[jxy/Tongyi-Finance-14B-Chat](https://huggingface.co/jxy/Tongyi-Finance-14B-Chat)| +|tongyi-finance-14b-chat-int4|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)|c_attn|qwen|✔|✔|auto_gptq>=0.5|financial|[jxy/Tongyi-Finance-14B-Chat-Int4](https://huggingface.co/jxy/Tongyi-Finance-14B-Chat-Int4)| +|codefuse-codellama-34b-chat|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)|q_proj, k_proj, v_proj|codefuse-codellama|✔|✔||coding|[codefuse-ai/CodeFuse-CodeLlama-34B](https://huggingface.co/codefuse-ai/CodeFuse-CodeLlama-34B)| +|codefuse-codegeex2-6b-chat|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeGeeX2-6B/summary)|query_key_value|codefuse|✘|✔|transformers<4.34|coding|[codefuse-ai/CodeFuse-CodeGeeX2-6B](https://huggingface.co/codefuse-ai/CodeFuse-CodeGeeX2-6B)| +|codefuse-qwen-14b-chat|[codefuse-ai/CodeFuse-QWen-14B](https://modelscope.cn/models/codefuse-ai/CodeFuse-QWen-14B/summary)|c_attn|codefuse|✔|✔||coding|[codefuse-ai/CodeFuse-QWen-14B](https://huggingface.co/codefuse-ai/CodeFuse-QWen-14B)| +|phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|✔|✔||coding|[microsoft/phi-2](https://huggingface.co/microsoft/phi-2)| 
+|cogvlm-17b-instruct|[ZhipuAI/cogvlm-chat](https://modelscope.cn/models/ZhipuAI/cogvlm-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense|cogvlm-instruct|✘|✘||multi-modal, vision|[THUDM/cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf)| +|cogagent-18b-chat|[ZhipuAI/cogagent-chat](https://modelscope.cn/models/ZhipuAI/cogagent-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-chat|✘|✘||multi-modal, vision|[THUDM/cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf)| +|cogagent-18b-instruct|[ZhipuAI/cogagent-vqa](https://modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent-instruct|✘|✘||multi-modal, vision|[THUDM/cogagent-vqa-hf](https://huggingface.co/THUDM/cogagent-vqa-hf)| +|mamba-130m|[AI-ModelScope/mamba-130m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-130m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-130m-hf](https://huggingface.co/state-spaces/mamba-130m-hf)| +|mamba-370m|[AI-ModelScope/mamba-370m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-370m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-370m-hf](https://huggingface.co/state-spaces/mamba-370m-hf)| +|mamba-390m|[AI-ModelScope/mamba-390m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-390m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-390m-hf](https://huggingface.co/state-spaces/mamba-390m-hf)| +|mamba-790m|[AI-ModelScope/mamba-790m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-790m-hf/summary)|in_proj, x_proj, embeddings, 
out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-790m-hf](https://huggingface.co/state-spaces/mamba-790m-hf)| +|mamba-1.4b|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-1.4b-hf](https://huggingface.co/state-spaces/mamba-1.4b-hf)| +|mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)| +|telechat-7b|[TeleAI/TeleChat-7B](https://modelscope.cn/models/TeleAI/TeleChat-7B/summary)|key_value, query|telechat|✔|✘||-|[Tele-AI/telechat-7B](https://huggingface.co/Tele-AI/telechat-7B)| +|telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|key_value, query|telechat|✔|✘||-|[Tele-AI/TeleChat-12B](https://huggingface.co/Tele-AI/TeleChat-12B)| +|grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|[hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1)| +|dbrx-instruct|[AI-ModelScope/dbrx-instruct](https://modelscope.cn/models/AI-ModelScope/dbrx-instruct/summary)|attn.Wqkv|dbrx|✔|✔|transformers>=4.36|-|[databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct)| +|dbrx-base|[AI-ModelScope/dbrx-base](https://modelscope.cn/models/AI-ModelScope/dbrx-base/summary)|attn.Wqkv|dbrx|✔|✔|transformers>=4.36|-|[databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)| +|mengzi3-13b-base|[langboat/Mengzi3-13B-Base](https://modelscope.cn/models/langboat/Mengzi3-13B-Base/summary)|q_proj, k_proj, v_proj|mengzi|✔|✔||-|[Langboat/Mengzi3-13B-Base](https://huggingface.co/Langboat/Mengzi3-13B-Base)| 
+|c4ai-command-r-v01|[AI-ModelScope/c4ai-command-r-v01](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-v01/summary)|q_proj, k_proj, v_proj|c4ai|✔|✘|transformers>=4.39.1|-|[CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)| +|c4ai-command-r-plus|[AI-ModelScope/c4ai-command-r-plus](https://modelscope.cn/models/AI-ModelScope/c4ai-command-r-plus/summary)|q_proj, k_proj, v_proj|c4ai|✔|✘|transformers>4.39|-|[CohereForAI/c4ai-command-r-plus](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| ## dataset @@ -233,95 +235,95 @@ The table below introduces the datasets supported by SWIFT: - Size: The data row count of the dataset. - Statistic: Dataset statistics. We use the number of tokens for statistics, which helps adjust the max_length hyperparameter. We concatenate the training and validation sets of the dataset and then compute the statistics. We use qwen's tokenizer to tokenize the dataset. Different tokenizers produce different statistics. If you want to obtain token statistics for tokenizers of other models, you can use the script to get them yourself. 
-| Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags | -| ------------ | ---------- | ---------- | -------- | ----------------- | ---- | -|🔥ms-bench|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|316228|0|345.0±441.3, min=22, max=30960|chat, general, multi-round| -|🔥ms-bench-mini|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|19492|0|353.9±439.4, min=29, max=12078|chat, general, multi-round| -|🔥alpaca-en|[AI-ModelScope/alpaca-gpt4-data-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)|52002|0|176.2±125.8, min=26, max=740|chat, general| -|🔥alpaca-zh|[AI-ModelScope/alpaca-gpt4-data-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)|48818|0|162.1±93.9, min=26, max=856|chat, general| -|multi-alpaca-all|[damo/nlp_polylm_multialpaca_sft](https://modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary)|131867|0|112.9±50.6, min=26, max=1226|chat, general, multilingual| -|instinwild-en|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|52191|0|160.2±69.7, min=33, max=763|chat, general| -|instinwild-zh|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|51504|0|130.3±45.1, min=28, max=1434|chat, general| -|cot-en|[YorickHe/CoT](https://modelscope.cn/datasets/YorickHe/CoT/summary)|74771|0|122.7±64.8, min=51, max=8320|chat, general| -|cot-zh|[YorickHe/CoT_zh](https://modelscope.cn/datasets/YorickHe/CoT_zh/summary)|74771|0|117.5±70.8, min=43, max=9636|chat, general| -|firefly-all-zh|[wyj123456/firefly](https://modelscope.cn/datasets/wyj123456/firefly/summary)|1649399|0|178.1±260.4, min=26, max=12516|chat, general| -|instruct-en|[wyj123456/instruct](https://modelscope.cn/datasets/wyj123456/instruct/summary)|888970|0|268.9±331.2, min=26, max=7252|chat, general| -|gpt4all-en|[wyj123456/GPT4all](https://modelscope.cn/datasets/wyj123456/GPT4all/summary)|806199|0|302.5±384.1, min=27, 
max=7391|chat, general| -|sharegpt-en|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|99799|0|1045.7±431.9, min=22, max=7907|chat, general, multi-round| -|sharegpt-zh|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|135399|0|806.3±771.7, min=21, max=65318|chat, general, multi-round| -|tulu-v2-sft-mixture|[AI-ModelScope/tulu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture/summary)|326154|0|867.8±996.4, min=22, max=12111|chat, multilingual, general, multi-round| -|wikipedia-zh|[AI-ModelScope/wikipedia-cn-20230720-filtered](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered/summary)|254547|0|568.4±713.2, min=37, max=78678|text-generation, general, pretrained| -|open-orca|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|3239027|0|360.4±402.9, min=27, max=8672|chat, multilingual, general| -|open-orca-gpt4|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|994896|0|382.3±417.4, min=31, max=8740|chat, multilingual, general| -|sharegpt-gpt4|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|103063|0|1286.2±2089.4, min=22, max=221080|chat, multilingual, general, multi-round| -|🔥sharegpt-gpt4-mini|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|6205|0|3511.6±6068.5, min=33, max=116018|chat, multilingual, general, multi-round, gpt4| -|🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)|30000|0|647.7±217.1, min=199, max=2722|chat, agent, multi-round| -|ms-agent-for-agentfabric-default|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|30000|0|617.8±199.1, min=251, max=2657|chat, agent, multi-round| 
-|ms-agent-for-agentfabric-addition|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|488|0|2084.9±1514.8, min=489, max=7354|chat, agent, multi-round| -|damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|422115|161|965.7±440.9, min=321, max=31535|chat, agent, multi-round| -|damo-agent-mini-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|39964|152|1230.9±350.1, min=558, max=4982|chat, agent, multi-round| -|agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|1866|0|1144.3±635.5, min=206, max=6412|chat, agent, multi-round| -|code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)|20016|0|100.1±60.1, min=29, max=1776|chat, coding| -|🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)|2359|0|723.8±233.5, min=259, max=2117|chat, coding| -|🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)|27224|0|483.6±193.9, min=45, max=3082|chat, coding| -|🔥codefuse-evol-instruction-zh|[codefuse-ai/Evol-instruction-66k](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k/summary)|66862|0|439.6±206.3, min=37, max=2983|chat, coding| -|medical-en|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|117117|500|257.4±89.1, min=36, max=2564|chat, medical| -|medical-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|1950472|500|167.2±219.7, min=26, max=27351|chat, medical| -|medical-mini-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|50000|500|168.1±220.8, min=26, max=12320|chat, medical| 
-|🔥disc-med-sft-zh|[AI-ModelScope/DISC-Med-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Med-SFT/summary)|441767|0|354.1±193.1, min=25, max=2231|chat, medical| -|lawyer-llama-zh|[AI-ModelScope/lawyer_llama_data](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary)|21476|0|194.4±91.7, min=27, max=924|chat, law| -|tigerbot-law-zh|[AI-ModelScope/tigerbot-law-plugin](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)|55895|0|109.9±126.4, min=37, max=18878|text-generation, law, pretrained| -|🔥disc-law-sft-zh|[AI-ModelScope/DISC-Law-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Law-SFT/summary)|166758|0|533.7±495.4, min=30, max=15169|chat, law| -|🔥blossom-math-zh|[AI-ModelScope/blossom-math-v2](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2/summary)|10000|0|169.3±58.7, min=35, max=563|chat, math| -|school-math-zh|[AI-ModelScope/school_math_0.25M](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M/summary)|248480|0|157.6±72.1, min=33, max=3450|chat, math| -|open-platypus-en|[AI-ModelScope/Open-Platypus](https://modelscope.cn/datasets/AI-ModelScope/Open-Platypus/summary)|24926|0|367.9±254.8, min=30, max=3951|chat, math| -|text2sql-en|[AI-ModelScope/texttosqlv2_25000_v2](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2/summary)|25000|0|274.6±326.4, min=38, max=1975|chat, sql| -|🔥sql-create-context-en|[AI-ModelScope/sql-create-context](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context/summary)|78577|0|80.2±17.8, min=36, max=456|chat, sql| -|🔥advertise-gen-zh|[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)|97484|915|131.6±21.7, min=52, max=242|text-generation| -|🔥dureader-robust-zh|[modelscope/DuReader_robust-QG](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary)|15937|1962|242.1±137.4, min=61, max=1417|text-generation| 
-|cmnli-zh|[clue](https://modelscope.cn/datasets/clue/summary)|391783|12241|83.6±16.6, min=52, max=200|text-generation, classification| -|🔥cmnli-mini-zh|[clue](https://modelscope.cn/datasets/clue/summary)|20000|200|82.9±16.3, min=52, max=188|text-generation, classification| -|🔥jd-sentiment-zh|[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd/summary)|45012|4988|67.0±83.2, min=40, max=4040|text-generation, classification| -|🔥hc3-zh|[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese/summary)|39781|0|177.8±81.5, min=58, max=3052|text-generation, classification| -|🔥hc3-en|[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3/summary)|11021|0|299.3±138.7, min=66, max=2268|text-generation, classification| -|finance-en|[wyj123456/finance_en](https://modelscope.cn/datasets/wyj123456/finance_en/summary)|68911|0|135.6±134.3, min=26, max=3525|chat, financial| -|poetry-zh|[modelscope/chinese-poetry-collection](https://modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary)|388599|1710|55.2±9.4, min=23, max=83|text-generation, poetry| -|webnovel-zh|[AI-ModelScope/webnovel_cn](https://modelscope.cn/datasets/AI-ModelScope/webnovel_cn/summary)|50000|0|1478.9±11526.1, min=100, max=490484|chat, novel| -|generated-chat-zh|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)|396004|0|273.3±52.0, min=32, max=873|chat, character-dialogue| -|cls-fudan-news-zh|[damo/zh_cls_fudan-news](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/summary)|4959|0|3234.4±2547.5, min=91, max=19548|chat, classification| -|ner-jave-zh|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)|1266|0|118.3±45.5, min=44, max=223|chat, ner| -|long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)|11998|0|9619.0±8295.8, min=36, max=78925|longlora, QA| 
-|coco-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|414113|40504|298.8±2.8, min=294, max=351|chat, multi-modal, vision| -|🔥coco-mini-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|298.8±2.8, min=294, max=339|chat, multi-modal, vision| -|🔥coco-mini-en-2|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|36.8±2.8, min=32, max=77|chat, multi-modal, vision| -|capcha-images|[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images/summary)|6000|2000|29.0±0.0, min=29, max=29|chat, multi-modal, vision| -|aishell1-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|134424|7176|152.2±36.8, min=63, max=419|chat, multi-modal, audio| -|🔥aishell1-mini-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|14326|200|152.0±35.5, min=74, max=359|chat, multi-modal, audio| -|hh-rlhf-harmless-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|42462|2308|167.2±123.1, min=22, max=986|rlhf, dpo, pairwise| -|hh-rlhf-helpful-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|43777|2348|201.9±135.2, min=25, max=1070|rlhf, dpo, pairwise| -|hh-rlhf-helpful-online|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|10150|1137|401.5±278.7, min=32, max=1987|rlhf, dpo, pairwise| -|hh-rlhf-helpful-rejection-sampled|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise| -|hh-rlhf-red-team-attempts|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, 
pairwise| -|🔥hh-rlhf-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|172085|9292|172.8±124.0, min=22, max=1638|rlhf, dpo, pairwise| -|hh-rlhf-cn-harmless-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|143.9±109.4, min=24, max=3078|rlhf, dpo, pairwise| -|hh-rlhf-cn-helpful-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|176.8±120.0, min=26, max=1420|rlhf, dpo, pairwise| -|hh-rlhf-cn-harmless-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|167.5±123.2, min=22, max=986|rlhf, dpo, pairwise| -|hh-rlhf-cn-helpful-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|202.2±135.3, min=25, max=1070|rlhf, dpo, pairwise| -|stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)|4483004|0|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise| -|pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)|214670|0|1612.3±8856.2, min=11, max=1208955|text-generation, awq| -|🔥coig-cqia-chinese-traditional|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1111|0|172.6±59.9, min=55, max=856|general| -|🔥coig-cqia-coig-pc|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3000|0|353.5±859.6, min=34, max=19288|general| -|🔥coig-cqia-exam|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|4856|0|275.0±240.0, min=45, max=4932|general| -|🔥coig-cqia-finance|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|11288|0|1266.4±561.1, min=60, max=10582|general| 
-|🔥coig-cqia-douban|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3086|0|402.9±544.7, min=88, max=10870|general| -|🔥coig-cqia-human-value|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1007|0|151.2±77.3, min=39, max=656|general| -|🔥coig-cqia-logi-qa|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|421|0|309.8±188.8, min=43, max=1306|general| -|🔥coig-cqia-ruozhiba|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|240|0|189.8±62.2, min=33, max=505|general| -|🔥coig-cqia-segmentfault|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|458|0|449.0±495.8, min=87, max=6342|general| -|🔥coig-cqia-wiki|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|10603|0|619.2±515.8, min=73, max=10140|general| -|🔥coig-cqia-wikihow|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1485|0|1700.0±790.9, min=260, max=6371|general| -|🔥coig-cqia-xhs|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1508|0|438.0±179.6, min=129, max=2191|general| -|🔥coig-cqia-zhihu|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|5631|0|540.7±306.7, min=161, max=3036|general| -|🔥ruozhiba-post-annual|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|1361|0|36.6±15.3, min=24, max=559|pretrain| -|🔥ruozhiba-title-good|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|2597|0|41.9±19.3, min=22, max=246|pretrain| -|🔥ruozhiba-title-norm|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|81700|0|39.9±12.8, min=21, max=386|pretrain| +| Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags | HF Dataset ID | +| ------------ | 
---------- | ---------- | -------- | ----------------- | ---- | ------------- | +|🔥ms-bench|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|316228|0|345.0±441.3, min=22, max=30960|chat, general, multi-round|-| +|🔥ms-bench-mini|[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench/summary)|19492|0|353.9±439.4, min=29, max=12078|chat, general, multi-round|-| +|🔥alpaca-en|[AI-ModelScope/alpaca-gpt4-data-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)|52002|0|176.2±125.8, min=26, max=740|chat, general|[vicgalle/alpaca-gpt4](https://huggingface.co/datasets/vicgalle/alpaca-gpt4)| +|🔥alpaca-zh|[AI-ModelScope/alpaca-gpt4-data-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)|48818|0|162.1±93.9, min=26, max=856|chat, general|[c-s-ale/alpaca-gpt4-data-zh](https://huggingface.co/datasets/c-s-ale/alpaca-gpt4-data-zh)| +|multi-alpaca-all|[damo/nlp_polylm_multialpaca_sft](https://modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary)|131867|0|112.9±50.6, min=26, max=1226|chat, general, multilingual|-| +|instinwild-en|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|52191|0|160.2±69.7, min=33, max=763|chat, general|-| +|instinwild-zh|[wyj123456/instinwild](https://modelscope.cn/datasets/wyj123456/instinwild/summary)|51504|0|130.3±45.1, min=28, max=1434|chat, general|-| +|cot-en|[YorickHe/CoT](https://modelscope.cn/datasets/YorickHe/CoT/summary)|74771|0|122.7±64.8, min=51, max=8320|chat, general|-| +|cot-zh|[YorickHe/CoT_zh](https://modelscope.cn/datasets/YorickHe/CoT_zh/summary)|74771|0|117.5±70.8, min=43, max=9636|chat, general|-| +|firefly-all-zh|[wyj123456/firefly](https://modelscope.cn/datasets/wyj123456/firefly/summary)|1649399|0|178.1±260.4, min=26, max=12516|chat, general|-| +|instruct-en|[wyj123456/instruct](https://modelscope.cn/datasets/wyj123456/instruct/summary)|888970|0|268.9±331.2, min=26, max=7252|chat, general|-| 
+|gpt4all-en|[wyj123456/GPT4all](https://modelscope.cn/datasets/wyj123456/GPT4all/summary)|806199|0|302.5±384.1, min=27, max=7391|chat, general|-| +|sharegpt-en|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|99799|0|1045.7±431.9, min=22, max=7907|chat, general, multi-round|-| +|sharegpt-zh|[huangjintao/sharegpt](https://modelscope.cn/datasets/huangjintao/sharegpt/summary)|135399|0|806.3±771.7, min=21, max=65318|chat, general, multi-round|-| +|tulu-v2-sft-mixture|[AI-ModelScope/tulu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture/summary)|326154|0|867.8±996.4, min=22, max=12111|chat, multilingual, general, multi-round|[allenai/tulu-v2-sft-mixture](https://huggingface.co/datasets/allenai/tulu-v2-sft-mixture)| +|wikipedia-zh|[AI-ModelScope/wikipedia-cn-20230720-filtered](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered/summary)|254547|0|568.4±713.2, min=37, max=78678|text-generation, general, pretrained|[pleisto/wikipedia-cn-20230720-filtered](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)| +|open-orca|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|3239027|0|360.4±402.9, min=27, max=8672|chat, multilingual, general|-| +|open-orca-gpt4|[AI-ModelScope/OpenOrca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary)|994896|0|382.3±417.4, min=31, max=8740|chat, multilingual, general|-| +|sharegpt-gpt4|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|103063|0|1286.2±2089.4, min=22, max=221080|chat, multilingual, general, multi-round|-| +|🔥sharegpt-gpt4-mini|[AI-ModelScope/sharegpt_gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)|6205|0|3511.6±6068.5, min=33, max=116018|chat, multilingual, general, multi-round, gpt4|-| +|🔥ms-agent|[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent/summary)|30000|0|647.7±217.1, min=199, 
max=2722|chat, agent, multi-round|-| +|ms-agent-for-agentfabric-default|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|30000|0|617.8±199.1, min=251, max=2657|chat, agent, multi-round|-| +|ms-agent-for-agentfabric-addition|[AI-ModelScope/ms_agent_for_agentfabric](https://modelscope.cn/datasets/AI-ModelScope/ms_agent_for_agentfabric/summary)|488|0|2084.9±1514.8, min=489, max=7354|chat, agent, multi-round|-| +|damo-agent-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|422115|161|965.7±440.9, min=321, max=31535|chat, agent, multi-round|-| +|damo-agent-mini-zh|[damo/MSAgent-Bench](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary)|39964|152|1230.9±350.1, min=558, max=4982|chat, agent, multi-round|-| +|agent-instruct-all-en|[huangjintao/AgentInstruct_copy](https://modelscope.cn/datasets/huangjintao/AgentInstruct_copy/summary)|1866|0|1144.3±635.5, min=206, max=6412|chat, agent, multi-round|-| +|code-alpaca-en|[wyj123456/code_alpaca_en](https://modelscope.cn/datasets/wyj123456/code_alpaca_en/summary)|20016|0|100.1±60.1, min=29, max=1776|chat, coding|[sahil2801/CodeAlpaca-20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)| +|🔥leetcode-python-en|[AI-ModelScope/leetcode-solutions-python](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary)|2359|0|723.8±233.5, min=259, max=2117|chat, coding|-| +|🔥codefuse-python-en|[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary)|27224|0|483.6±193.9, min=45, max=3082|chat, coding|-| +|🔥codefuse-evol-instruction-zh|[codefuse-ai/Evol-instruction-66k](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k/summary)|66862|0|439.6±206.3, min=37, max=2983|chat, coding|-| +|medical-en|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|117117|500|257.4±89.1, min=36, max=2564|chat, medical|-| 
+|medical-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|1950472|500|167.2±219.7, min=26, max=27351|chat, medical|-| +|medical-mini-zh|[huangjintao/medical_zh](https://modelscope.cn/datasets/huangjintao/medical_zh/summary)|50000|500|168.1±220.8, min=26, max=12320|chat, medical|-| +|🔥disc-med-sft-zh|[AI-ModelScope/DISC-Med-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Med-SFT/summary)|441767|0|354.1±193.1, min=25, max=2231|chat, medical|[Flmc/DISC-Med-SFT](https://huggingface.co/datasets/Flmc/DISC-Med-SFT)| +|lawyer-llama-zh|[AI-ModelScope/lawyer_llama_data](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary)|21476|0|194.4±91.7, min=27, max=924|chat, law|[Skepsun/lawyer_llama_data](https://huggingface.co/datasets/Skepsun/lawyer_llama_data)| +|tigerbot-law-zh|[AI-ModelScope/tigerbot-law-plugin](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)|55895|0|109.9±126.4, min=37, max=18878|text-generation, law, pretrained|[TigerResearch/tigerbot-law-plugin](https://huggingface.co/datasets/TigerResearch/tigerbot-law-plugin)| +|🔥disc-law-sft-zh|[AI-ModelScope/DISC-Law-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Law-SFT/summary)|166758|0|533.7±495.4, min=30, max=15169|chat, law|-| +|🔥blossom-math-zh|[AI-ModelScope/blossom-math-v2](https://modelscope.cn/datasets/AI-ModelScope/blossom-math-v2/summary)|10000|0|169.3±58.7, min=35, max=563|chat, math|[Azure99/blossom-math-v2](https://huggingface.co/datasets/Azure99/blossom-math-v2)| +|school-math-zh|[AI-ModelScope/school_math_0.25M](https://modelscope.cn/datasets/AI-ModelScope/school_math_0.25M/summary)|248480|0|157.6±72.1, min=33, max=3450|chat, math|[BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)| +|open-platypus-en|[AI-ModelScope/Open-Platypus](https://modelscope.cn/datasets/AI-ModelScope/Open-Platypus/summary)|24926|0|367.9±254.8, min=30, max=3951|chat, 
math|[garage-bAInd/Open-Platypus](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)| +|text2sql-en|[AI-ModelScope/texttosqlv2_25000_v2](https://modelscope.cn/datasets/AI-ModelScope/texttosqlv2_25000_v2/summary)|25000|0|274.6±326.4, min=38, max=1975|chat, sql|[Clinton/texttosqlv2_25000_v2](https://huggingface.co/datasets/Clinton/texttosqlv2_25000_v2)| +|🔥sql-create-context-en|[AI-ModelScope/sql-create-context](https://modelscope.cn/datasets/AI-ModelScope/sql-create-context/summary)|78577|0|80.2±17.8, min=36, max=456|chat, sql|[b-mc2/sql-create-context](https://huggingface.co/datasets/b-mc2/sql-create-context)| +|🔥advertise-gen-zh|[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen/summary)|97484|915|131.6±21.7, min=52, max=242|text-generation|[shibing624/AdvertiseGen](https://huggingface.co/datasets/shibing624/AdvertiseGen)| +|🔥dureader-robust-zh|[modelscope/DuReader_robust-QG](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG/summary)|15937|1962|242.1±137.4, min=61, max=1417|text-generation|-| +|cmnli-zh|[clue](https://modelscope.cn/datasets/clue/summary)|391783|12241|83.6±16.6, min=52, max=200|text-generation, classification|[clue](https://huggingface.co/datasets/clue)| +|🔥cmnli-mini-zh|[clue](https://modelscope.cn/datasets/clue/summary)|20000|200|82.9±16.3, min=52, max=188|text-generation, classification|[clue](https://huggingface.co/datasets/clue)| +|🔥jd-sentiment-zh|[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd/summary)|45012|4988|67.0±83.2, min=40, max=4040|text-generation, classification|-| +|🔥hc3-zh|[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese/summary)|39781|0|177.8±81.5, min=58, max=3052|text-generation, classification|[Hello-SimpleAI/HC3-Chinese](https://huggingface.co/datasets/Hello-SimpleAI/HC3-Chinese)| +|🔥hc3-en|[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3/summary)|11021|0|299.3±138.7, min=66, max=2268|text-generation, 
classification|[Hello-SimpleAI/HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3)| +|finance-en|[wyj123456/finance_en](https://modelscope.cn/datasets/wyj123456/finance_en/summary)|68911|0|135.6±134.3, min=26, max=3525|chat, financial|[ssbuild/alpaca_finance_en](https://huggingface.co/datasets/ssbuild/alpaca_finance_en)| +|poetry-zh|[modelscope/chinese-poetry-collection](https://modelscope.cn/datasets/modelscope/chinese-poetry-collection/summary)|388599|1710|55.2±9.4, min=23, max=83|text-generation, poetry|-| +|webnovel-zh|[AI-ModelScope/webnovel_cn](https://modelscope.cn/datasets/AI-ModelScope/webnovel_cn/summary)|50000|0|1478.9±11526.1, min=100, max=490484|chat, novel|[zxbsmk/webnovel_cn](https://huggingface.co/datasets/zxbsmk/webnovel_cn)| +|generated-chat-zh|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)|396004|0|273.3±52.0, min=32, max=873|chat, character-dialogue|[BelleGroup/generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)| +|cls-fudan-news-zh|[damo/zh_cls_fudan-news](https://modelscope.cn/datasets/damo/zh_cls_fudan-news/summary)|4959|0|3234.4±2547.5, min=91, max=19548|chat, classification|-| +|ner-jave-zh|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)|1266|0|118.3±45.5, min=44, max=223|chat, ner|-| +|long-alpaca-12k|[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/summary)|11998|0|9619.0±8295.8, min=36, max=78925|longlora, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)| +|coco-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|414113|40504|298.8±2.8, min=294, max=351|chat, multi-modal, vision|-| +|🔥coco-mini-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|298.8±2.8, min=294, max=339|chat, multi-modal, vision|-| 
+|🔥coco-mini-en-2|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|36.8±2.8, min=32, max=77|chat, multi-modal, vision|-| +|capcha-images|[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images/summary)|6000|2000|29.0±0.0, min=29, max=29|chat, multi-modal, vision|-| +|aishell1-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|134424|7176|152.2±36.8, min=63, max=419|chat, multi-modal, audio|-| +|🔥aishell1-mini-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|14326|200|152.0±35.5, min=74, max=359|chat, multi-modal, audio|-| +|hh-rlhf-harmless-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|42462|2308|167.2±123.1, min=22, max=986|rlhf, dpo, pairwise|-| +|hh-rlhf-helpful-base|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|43777|2348|201.9±135.2, min=25, max=1070|rlhf, dpo, pairwise|-| +|hh-rlhf-helpful-online|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|10150|1137|401.5±278.7, min=32, max=1987|rlhf, dpo, pairwise|-| +|hh-rlhf-helpful-rejection-sampled|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise|-| +|hh-rlhf-red-team-attempts|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|52413|2749|247.0±152.6, min=26, max=1300|rlhf, dpo, pairwise|-| +|🔥hh-rlhf-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|172085|9292|172.8±124.0, min=22, max=1638|rlhf, dpo, pairwise|-| +|hh-rlhf-cn-harmless-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|143.9±109.4, min=24, 
max=3078|rlhf, dpo, pairwise|-| +|hh-rlhf-cn-helpful-base-cn|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|176.8±120.0, min=26, max=1420|rlhf, dpo, pairwise|-| +|hh-rlhf-cn-harmless-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|42394|2304|167.5±123.2, min=22, max=986|rlhf, dpo, pairwise|-| +|hh-rlhf-cn-helpful-base-en|[AI-ModelScope/hh_rlhf_cn](https://modelscope.cn/datasets/AI-ModelScope/hh_rlhf_cn/summary)|43722|2346|202.2±135.3, min=25, max=1070|rlhf, dpo, pairwise|-| +|stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)|4483004|0|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|-| +|pileval|[huangjintao/pile-val-backup](https://modelscope.cn/datasets/huangjintao/pile-val-backup/summary)|214670|0|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)| +|🔥coig-cqia-chinese-traditional|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1111|0|172.6±59.9, min=55, max=856|general|-| +|🔥coig-cqia-coig-pc|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3000|0|353.5±859.6, min=34, max=19288|general|-| +|🔥coig-cqia-exam|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|4856|0|275.0±240.0, min=45, max=4932|general|-| +|🔥coig-cqia-finance|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|11288|0|1266.4±561.1, min=60, max=10582|general|-| +|🔥coig-cqia-douban|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|3086|0|402.9±544.7, min=88, max=10870|general|-| +|🔥coig-cqia-human-value|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1007|0|151.2±77.3, 
min=39, max=656|general|-| +|🔥coig-cqia-logi-qa|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|421|0|309.8±188.8, min=43, max=1306|general|-| +|🔥coig-cqia-ruozhiba|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|240|0|189.8±62.2, min=33, max=505|general|-| +|🔥coig-cqia-segmentfault|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|458|0|449.0±495.8, min=87, max=6342|general|-| +|🔥coig-cqia-wiki|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|10603|0|619.2±515.8, min=73, max=10140|general|-| +|🔥coig-cqia-wikihow|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1485|0|1700.0±790.9, min=260, max=6371|general|-| +|🔥coig-cqia-xhs|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|1508|0|438.0±179.6, min=129, max=2191|general|-| +|🔥coig-cqia-zhihu|[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/summary)|5631|0|540.7±306.7, min=161, max=3036|general|-| +|🔥ruozhiba-post-annual|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|1361|0|36.6±15.3, min=24, max=559|pretrain|-| +|🔥ruozhiba-title-good|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|2597|0|41.9±19.3, min=22, max=246|pretrain|-| +|🔥ruozhiba-title-norm|[AI-ModelScope/ruozhiba](https://modelscope.cn/datasets/AI-ModelScope/ruozhiba/summary)|81700|0|39.9±12.8, min=21, max=386|pretrain|-| diff --git a/scripts/utils/run_dataset_info.py b/scripts/utils/run_dataset_info.py index a867bf2600..9255bfaad2 100644 --- a/scripts/utils/run_dataset_info.py +++ b/scripts/utils/run_dataset_info.py @@ -30,10 +30,10 @@ def write_dataset_info() -> None: res_text_list = [] res_text_list.append( - '| Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags |' + '| 
Dataset Name | Dataset ID | Train Size | Val Size | Statistic (token) | Tags | HF Dataset ID |' ) res_text_list.append( - '| ------------ | ---------- | ---------- | -------- | ----------------- | ---- |' + '| ------------ | ---------- | ---------- | -------- | ----------------- | ---- | ------------- |' ) if len(text_list) >= 2: text_list = text_list[2:] @@ -88,7 +88,7 @@ def write_dataset_info() -> None: _token_len.append(len(input_ids[i])) stat = stat_array(_token_len)[0] stat_str = f"{stat['mean']:.1f}±{stat['std']:.1f}, min={stat['min']}, max={stat['max']}" - url = f"https://modelscope.cn/datasets/{dataset_info['dataset_id_or_path']}/summary" + ms_url = f"https://modelscope.cn/datasets/{dataset_info['dataset_id_or_path']}/summary" if '🔥' in tags: tags.remove('🔥') @@ -96,9 +96,17 @@ def write_dataset_info() -> None: tags_str = ', '.join(tags) if len(tags_str) == 0: tags_str = '-' + hf_dataset_id = dataset_info.get('hf_dataset_id') + if hf_dataset_id is None: + hf_dataset_id = '-' + hf_dataset_id_str = '-' + else: + hf_url = f'https://huggingface.co/datasets/{hf_dataset_id}' + hf_dataset_id_str = f'[{hf_dataset_id}]({hf_url})' + res_text_list.append( - f"|{dataset_name}|[{dataset_info['dataset_id_or_path']}]({url})|{train_size}|" - f'{val_size}|{stat_str}|{tags_str}|') + f"|{dataset_name}|[{dataset_info['dataset_id_or_path']}]({ms_url})|{train_size}|" + f'{val_size}|{stat_str}|{tags_str}|{hf_dataset_id_str}|') print(f'数据集总数: {len(dataset_name_list)}') for idx in range(len(fpaths)): diff --git a/scripts/utils/run_model_info.py b/scripts/utils/run_model_info.py index 63a636d49c..09a7f6b49f 100644 --- a/scripts/utils/run_model_info.py +++ b/scripts/utils/run_model_info.py @@ -12,9 +12,10 @@ def get_model_info_table() -> List[str]: model_name_list = ModelType.get_model_name_list() result = ( '| Model Type | Model ID | Default Lora Target Modules | Default Template |' - ' Support Flash Attn | Support VLLM | Requires | Tags |\n' + ' Support Flash Attn | Support VLLM | 
Requires | Tags | HF Model ID |\n' '| --------- | -------- | --------------------------- | ---------------- |' - ' ------------------ | ------------ | -------- | ---- |\n') + ' ------------------ | ------------ | -------- | ---- | ----------- |\n' + ) res: List[str] = [] bool_mapping = {True: '✔', False: '✘'} for model_name in model_name_list: @@ -31,15 +32,23 @@ def get_model_info_table() -> List[str]: tags_str = ', '.join(tags) if len(tags_str) == 0: tags_str = '-' + hf_model_id = model_info.get('hf_model_id') + if hf_model_id is None: + hf_model_id = '-' r = [ model_name, model_id, lora_target_modules, template, - support_flash_attn, support_vllm, requires, tags_str + support_flash_attn, support_vllm, requires, tags_str, hf_model_id ] res.append(r) text = '' for r in res: - url = f'https://modelscope.cn/models/{r[1]}/summary' - text += f'|{r[0]}|[{r[1]}]({url})|{r[2]}|{r[3]}|{r[4]}|{r[5]}|{r[6]}|{r[7]}|\n' + ms_url = f'https://modelscope.cn/models/{r[1]}/summary' + if r[8] != '-': + hf_url = f'https://huggingface.co/{r[8]}' + hf_model_id_str = f'[{r[8]}]({hf_url})' + else: + hf_model_id_str = '-' + text += f'|{r[0]}|[{r[1]}]({ms_url})|{r[2]}|{r[3]}|{r[4]}|{r[5]}|{r[6]}|{r[7]}|{hf_model_id_str}|\n' print(f'模型总数: {len(res)}') result += text for idx, fpath in enumerate(fpaths): diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 9fab7662bd..72b0d57b2e 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -274,7 +274,8 @@ def set_model_type(self: Union['SftArguments', 'InferArguments']) -> None: model_info['revision'] = 'main' self.model_revision = model_info['revision'] if self.model_id_or_path is None: - self.model_id_or_path = model_info['model_id_or_path'] + self.model_id_or_path = model_info[ + 'hf_model_id'] if use_hf else model_info['model_id_or_path'] requires = model_info['requires'] for require in requires: require_version(require) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 
9860aa7e27..6ca45e74f9 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -1554,10 +1554,16 @@ def get_dataset( for dataset_name in dataset_name_list: dataset_info = DATASET_MAPPING[dataset_name] use_hf = strtobool(os.environ.get('USE_HF', 'False')) + dataset_str_f = 'Downloading the dataset from {hub}, dataset_id: {dataset_id}' if use_hf: dataset_id_or_path = dataset_info['hf_dataset_id'] + dataset_str = dataset_str_f.format( + hub='HuggingFace', dataset_id=dataset_id_or_path) else: dataset_id_or_path = dataset_info['dataset_id_or_path'] + dataset_str = dataset_str_f.format( + hub='ModelScope', dataset_id=dataset_id_or_path) + logger.info(dataset_str) assert dataset_id_or_path is not None, ( f'dataset_name: {dataset_name}, use_hf: {use_hf}, ' f'dataset_id_or_path: {dataset_id_or_path}.') diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 7d5b667233..90a24dc54b 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -1719,6 +1719,14 @@ def get_model_tokenizer_aqlm(model_dir: str, support_vllm=True, requires=['transformers>=4.37'], hf_model_id='Qwen/Qwen1.5-MoE-A2.7B-Chat') +@register_model( + ModelType.codeqwen1half_7b_chat, + 'qwen/CodeQwen1.5-7B-Chat', + LoRATM.qwen1half, + TemplateType.qwen, + support_flash_attn=True, + support_vllm=True, + requires=['transformers>=4.37']) def get_model_tokenizer_qwen1half(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], From 91f28f4373c0c236d70e944ece1baf3a51efcf0e Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 18 Apr 2024 02:37:27 +0800 Subject: [PATCH 16/26] support awq gptq --- swift/llm/infer.py | 4 +- swift/llm/utils/argument.py | 24 +++--- swift/llm/utils/model.py | 164 ++++++++++++++++++++++-------------- swift/llm/utils/template.py | 6 ++ 4 files changed, 121 insertions(+), 77 deletions(-) diff --git a/swift/llm/infer.py b/swift/llm/infer.py index e34177a2ad..24f998b896 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ 
-84,8 +84,8 @@ def merge_lora(args: InferArguments, logger.info(f'replace_if_exists: {replace_if_exists}') assert args.ckpt_dir is not None, 'args.ckpt_dir is not specified.' assert args.sft_type == 'lora', "Only supports sft_type == 'lora'" - assert 'int4' not in args.model_type, 'int4 model is not supported' - assert 'int8' not in args.model_type, 'int8 model is not supported' + for s in ['int4', 'int8', 'awq']: + assert s not in args.model_type, f'{s} model is not supported' if args.quantization_bit != 0: logger.warning('It is not recommended to merge quantized models, ' 'as this can result in performance degradation') diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 72b0d57b2e..7e4bb2380c 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -118,18 +118,18 @@ def select_dtype( else: raise ValueError(f'args.dtype: {self.dtype}') # cuda, npu - if self.dtype == 'AUTO' and not is_torch_bf16_gpu_available(): - self.dtype = 'fp16' - if self.dtype == 'AUTO' and ('int4' in self.model_type - or 'int8' in self.model_type): - model_torch_dtype = MODEL_MAPPING[self.model_type]['torch_dtype'] - if model_torch_dtype is not None: - self.dtype = dtype_mapping[model_torch_dtype] if self.dtype == 'AUTO': - if isinstance(self, SftArguments): - self.dtype = 'bf16' + if not is_torch_bf16_gpu_available(): + self.dtype = 'fp16' else: - return None, False, False + model_torch_dtype = MODEL_MAPPING[self.model_type].get( + 'torch_dtype') + if model_torch_dtype is not None: + self.dtype = dtype_mapping[model_torch_dtype] + elif isinstance(self, SftArguments): + self.dtype = 'bf16' + else: + return None, False, False torch_dtype = dtype_mapping_reversed[self.dtype] @@ -661,8 +661,8 @@ def __post_init__(self) -> None: assert len(self.additional_trainable_parameters) == 0, ( 'lora does not support `additional_trainable_parameters`, please set `--sft_type full`' ) - if 'int4' in self.model_type or 'int8' in self.model_type: - assert
self.quantization_bit == 0, 'int4 and int8 models do not need to be quantized again.' + if 'int4' in self.model_type or 'int8' in self.model_type or 'awq' in self.model_type: + assert self.quantization_bit == 0, 'int4, int8 or awq models do not need to be quantized again.' if self.learning_rate is None: self.learning_rate = 1e-4 if self.save_only_model is None: diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 90a24dc54b..a0c4b87385 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -216,6 +216,9 @@ class ModelType: mixtral_moe_7b_instruct = 'mixtral-moe-7b-instruct' mixtral_moe_7b_aqlm_2bit_1x16 = 'mixtral-moe-7b-aqlm-2bit-1x16' # aqlm mixtral_moe_8x22b_v1 = 'mixtral-moe-8x22b-v1' + # wizardlm + wizardlm2_7b_awq = 'wizardlm2-7b-awq' + wizardlm2_8x22b = 'wizardlm2-8x22b' # baichuan baichuan_7b = 'baichuan-7b' baichuan_13b = 'baichuan-13b' @@ -413,6 +416,44 @@ def _register_model( return _register_model +def _check_awq_ext() -> None: + try: + from awq.utils.packing_utils import dequantize_gemm + import awq_ext # with CUDA kernels (AutoAWQ_kernels) + except ImportError as e: + raise ImportError( + 'You are training awq models, remember installing awq_ext by ' + '`git clone https://github.com/casper-hansen/AutoAWQ_kernels ' + '&& cd AutoAWQ_kernels && pip install -e .`') from e + + +def _check_gptq_model(bits: int, model_kwargs: Dict[str, Any]) -> None: + assert model_kwargs.get('quantization_config') is None + if version.parse(transformers.__version__) >= version.parse('4.35'): + model_kwargs['quantization_config'] = GPTQConfig( + bits=bits, use_exllama=False) + else: + model_kwargs['quantization_config'] = GPTQConfig( + bits=bits, disable_exllama=True) + + # fix quantlinear bug + from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear + __old_forward = QuantLinear.forward + + def _new_forward(self, x): + if not self.training or not self.autogptq_cuda_available: + return self.__old_forward(x) + # fix sft no grad 
+ self.autogptq_cuda_available = False + res = self.__old_forward(x) + self.autogptq_cuda_available = True + return res + + if not hasattr(QuantLinear, '__old_forward'): # avoid double patching + QuantLinear.__old_forward = __old_forward + QuantLinear.forward = _new_forward + + @register_model( ModelType.internlm_20b, 'Shanghai_AI_Laboratory/internlm-20b', @@ -564,6 +605,13 @@ def get_model_tokenizer_from_repo(model_dir: str, automodel_class=AutoModelForCausalLM, **kwargs): """load from an independent repository""" + is_awq = kwargs.pop('is_awq', False) + gptq_bits = kwargs.pop('gptq_bits', 0) + is_training = kwargs.pop('is_training', False) + if is_awq and is_training: + _check_awq_ext() + if gptq_bits > 0 and is_training: + _check_gptq_model(gptq_bits, model_kwargs) if model_config is None: model_config = AutoConfig.from_pretrained( model_dir, trust_remote_code=True) @@ -585,6 +633,10 @@ def get_model_tokenizer_from_repo(model_dir: str, torch_dtype=torch_dtype, trust_remote_code=True, **model_kwargs) + if is_awq: + model.is_awq = is_awq + if gptq_bits > 0: + model.gptq_bits = gptq_bits return model, tokenizer @@ -1026,6 +1078,17 @@ def cross_entropy_forward(self, inputs: Tensor, return model, tokenizer +@register_model( + ModelType.wizardlm2_7b_awq, + 'AI-ModelScope/WizardLM-2-7B-AWQ', + LoRATM.llama2, + TemplateType.wizardlm2_awq, + requires=['transformers>=4.34'], + torch_dtype=torch.float16, + support_flash_attn=True, + support_vllm=True, + function_kwargs={'is_awq': True}, + hf_model_id='MaziyarPanahi/WizardLM-2-7B-AWQ') @register_model( ModelType.gemma_2b, 'AI-ModelScope/gemma-2b', @@ -1163,7 +1226,8 @@ def cross_entropy_forward(self, inputs: Tensor, TemplateType.default_generation, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/CodeQwen1.5-7B') @register_model( ModelType.qwen1half_moe_a2_7b, 'qwen/Qwen1.5-MoE-A2.7B', @@ -1577,6 +1641,7 @@ def 
get_model_tokenizer_aqlm(model_dir: str, support_vllm=True, function_kwargs={'is_awq': True}, requires=['transformers>=4.37', 'autoawq'], + torch_dtype=torch.float16, hf_model_id='Qwen/Qwen1.5-0.5B-Chat-AWQ') @register_model( ModelType.qwen1half_1_8b_chat_awq, @@ -1587,6 +1652,7 @@ def get_model_tokenizer_aqlm(model_dir: str, support_vllm=True, function_kwargs={'is_awq': True}, requires=['transformers>=4.37', 'autoawq'], + torch_dtype=torch.float16, hf_model_id='Qwen/Qwen1.5-1.8B-Chat-AWQ') @register_model( ModelType.qwen1half_4b_chat_awq, @@ -1597,6 +1663,7 @@ def get_model_tokenizer_aqlm(model_dir: str, support_vllm=True, function_kwargs={'is_awq': True}, requires=['transformers>=4.37', 'autoawq'], + torch_dtype=torch.float16, hf_model_id='Qwen/Qwen1.5-4B-Chat-AWQ') @register_model( ModelType.qwen1half_7b_chat_awq, @@ -1607,6 +1674,7 @@ def get_model_tokenizer_aqlm(model_dir: str, support_vllm=True, function_kwargs={'is_awq': True}, requires=['transformers>=4.37', 'autoawq'], + torch_dtype=torch.float16, hf_model_id='Qwen/Qwen1.5-7B-Chat-AWQ') @register_model( ModelType.qwen1half_14b_chat_awq, @@ -1617,6 +1685,7 @@ def get_model_tokenizer_aqlm(model_dir: str, support_vllm=True, function_kwargs={'is_awq': True}, requires=['transformers>=4.37', 'autoawq'], + torch_dtype=torch.float16, hf_model_id='Qwen/Qwen1.5-14B-Chat-AWQ') @register_model( ModelType.qwen1half_32b_chat_awq, @@ -1627,6 +1696,7 @@ def get_model_tokenizer_aqlm(model_dir: str, support_vllm=True, function_kwargs={'is_awq': True}, requires=['transformers>=4.37', 'autoawq'], + torch_dtype=torch.float16, hf_model_id='Qwen/Qwen1.5-32B-Chat-AWQ') @register_model( ModelType.qwen1half_72b_chat_awq, @@ -1637,6 +1707,7 @@ def get_model_tokenizer_aqlm(model_dir: str, support_vllm=True, function_kwargs={'is_awq': True}, requires=['transformers>=4.37', 'autoawq'], + torch_dtype=torch.float16, hf_model_id='Qwen/Qwen1.5-72B-Chat-AWQ') @register_model( ModelType.codeqwen1half_7b_chat_awq, @@ -1646,7 +1717,9 @@ def 
get_model_tokenizer_aqlm(model_dir: str, support_flash_attn=True, support_vllm=True, function_kwargs={'is_awq': True}, - requires=['transformers>=4.37', 'autoawq']) + requires=['transformers>=4.37', 'autoawq'], + torch_dtype=torch.float16, + hf_model_id='Qwen/CodeQwen1.5-7B-Chat-AWQ') @register_model( ModelType.qwen1half_0_5b_chat, 'qwen/Qwen1.5-0.5B-Chat', @@ -1726,24 +1799,14 @@ def get_model_tokenizer_aqlm(model_dir: str, TemplateType.qwen, support_flash_attn=True, support_vllm=True, - requires=['transformers>=4.37']) + requires=['transformers>=4.37'], + hf_model_id='Qwen/CodeQwen1.5-7B-Chat') def get_model_tokenizer_qwen1half(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], load_model: bool = True, **kwargs): - is_awq = kwargs.pop('is_awq', False) - is_training = kwargs.pop('is_training', False) kwargs['eos_token'] = '<|im_end|>' - if is_awq and is_training: - try: - from awq.utils.packing_utils import dequantize_gemm - import awq_ext # with CUDA kernels (AutoAWQ_kernels) - except ImportError as e: - raise ImportError( - 'You are training awq models, remember installing awq_ext by ' - '`git clone https://github.com/casper-hansen/AutoAWQ_kernels ' - '&& cd AutoAWQ_kernels && pip install -e .`') from e return get_model_tokenizer_with_flash_attn(model_dir, torch_dtype, model_kwargs, load_model, **kwargs) @@ -1756,7 +1819,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, support_vllm=True, hf_model_id='Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4') @@ -1767,7 +1830,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 8}, + function_kwargs={'gptq_bits': 8}, support_flash_attn=True, hf_model_id='Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8') @register_model( 
@@ -1777,7 +1840,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, support_vllm=True, hf_model_id='Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4') @@ -1788,7 +1851,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 8}, + function_kwargs={'gptq_bits': 8}, support_flash_attn=True, hf_model_id='Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8') @register_model( @@ -1798,7 +1861,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, support_vllm=True, hf_model_id='Qwen/Qwen1.5-4B-Chat-GPTQ-Int4') @@ -1809,7 +1872,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 8}, + function_kwargs={'gptq_bits': 8}, support_flash_attn=True, hf_model_id='Qwen/Qwen1.5-4B-Chat-GPTQ-Int8') @register_model( @@ -1819,7 +1882,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, support_vllm=True, hf_model_id='Qwen/Qwen1.5-7B-Chat-GPTQ-Int4') @@ -1830,7 +1893,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 8}, + function_kwargs={'gptq_bits': 8}, support_flash_attn=True, hf_model_id='Qwen/Qwen1.5-7B-Chat-GPTQ-Int8') @register_model( @@ -1840,7 +1903,7 @@ def 
get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, support_vllm=True, hf_model_id='Qwen/Qwen1.5-14B-Chat-GPTQ-Int4') @@ -1851,7 +1914,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 8}, + function_kwargs={'gptq_bits': 8}, support_flash_attn=True, hf_model_id='Qwen/Qwen1.5-14B-Chat-GPTQ-Int8') @register_model( @@ -1861,7 +1924,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, support_vllm=True, hf_model_id='Qwen/Qwen1.5-32B-Chat-GPTQ-Int4') @@ -1872,7 +1935,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, support_vllm=True, hf_model_id='Qwen/Qwen1.5-72B-Chat-GPTQ-Int4') @@ -1883,7 +1946,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 8}, + function_kwargs={'gptq_bits': 8}, support_flash_attn=True, hf_model_id='Qwen/Qwen1.5-72B-Chat-GPTQ-Int8') @register_model( @@ -1893,7 +1956,7 @@ def get_model_tokenizer_qwen1half(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5', 'transformers>=4.37'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, hf_model_id='Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4') def get_model_tokenizer_qwen1half_intx(model_dir: str, @@ -2648,7 +2711,7 @@ def 
get_model_tokenizer_qwen_audio(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, - function_kwargs={'bits': 8}, + function_kwargs={'gptq_bits': 8}, support_flash_attn=True, hf_model_id='Qwen/Qwen-1_8B-Chat-Int8') @register_model( @@ -2658,7 +2721,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, support_vllm=True, hf_model_id='Qwen/Qwen-1_8B-Chat-Int4') @@ -2669,7 +2732,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, - function_kwargs={'bits': 8}, + function_kwargs={'gptq_bits': 8}, support_flash_attn=True, hf_model_id='Qwen/Qwen-72B-Chat-Int8') @register_model( @@ -2679,7 +2742,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, support_vllm=True, hf_model_id='Qwen/Qwen-72B-Chat-Int4') @@ -2690,7 +2753,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, support_vllm=True, tags=['financial'], @@ -2704,7 +2767,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str, torch_dtype=torch.float16, function_kwargs={ 'get_qwen_function': get_model_tokenizer_qwen_vl, - 'bits': 4 + 'gptq_bits': 4 }, support_flash_attn=True, tags=['multi-modal', 'vision'], @@ -2716,7 +2779,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, - function_kwargs={'bits': 8}, + function_kwargs={'gptq_bits': 8}, support_flash_attn=True, hf_model_id='Qwen/Qwen-14B-Chat-Int8') @register_model( @@ 
-2726,7 +2789,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, - function_kwargs={'bits': 8}, + function_kwargs={'gptq_bits': 8}, support_flash_attn=True, hf_model_id='Qwen/Qwen-7B-Chat-Int8') @register_model( @@ -2736,7 +2799,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, support_vllm=True, hf_model_id='Qwen/Qwen-14B-Chat-Int4') @@ -2747,7 +2810,7 @@ def get_model_tokenizer_qwen_audio(model_dir: str, TemplateType.qwen, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, - function_kwargs={'bits': 4}, + function_kwargs={'gptq_bits': 4}, support_flash_attn=True, support_vllm=True, hf_model_id='Qwen/Qwen-7B-Chat-Int4') @@ -2756,31 +2819,6 @@ def get_model_tokenizer_qwen_intx(model_dir: str, model_kwargs: Dict[str, Any], load_model: bool = True, **kwargs): - logger.info('use gptq, ignore bnb arguments') - bits = kwargs.pop('bits') - if version.parse(transformers.__version__) >= version.parse('4.35'): - model_kwargs['quantization_config'] = GPTQConfig( - bits=bits, use_exllama=False) - else: - model_kwargs['quantization_config'] = GPTQConfig( - bits=bits, disable_exllama=True) - - # fix quantlinear bug - from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear - __old_forward = QuantLinear.forward - - def _new_forward(self, x): - if not self.training or not self.autogptq_cuda_available: - return self.__old_forward(x) - # fix sft no grad - self.autogptq_cuda_available = False - res = self.__old_forward(x) - self.autogptq_cuda_available = True - return res - - if not hasattr(QuantLinear, '__old_forward'): # avoid double patching - QuantLinear.__old_forward = __old_forward - QuantLinear.forward = _new_forward get_qwen_function = kwargs.pop('get_qwen_function', get_model_tokenizer_qwen_chat) model, 
tokenizer = get_qwen_function(model_dir, torch_dtype, model_kwargs, diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 1c7a608636..8d03bbc3f8 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -61,6 +61,7 @@ class TemplateType: minicpm_v = 'minicpm-v' gemma = 'gemma' mplug_owl2 = 'mplug-owl2' + wizardlm2_awq = 'wizardlm2-awq' # compatibility. (Deprecated) chatml = 'chatml' telechat = 'telechat' @@ -1363,6 +1364,11 @@ def data_collator(self, use_model=True, lazy_tokenize=True) +register_template( + TemplateType.wizardlm2_awq, + Template(['{{SYSTEM}}'], ['User:\n{{QUERY}}\n\nAssistant:\n'], ['\n\n'], + [''])) + def get_template( template_type: str, From e451533f7dd153bbe02a3af04fd16d323898755b Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 18 Apr 2024 03:22:33 +0800 Subject: [PATCH 17/26] support yi-awq, yi-gptq --- swift/llm/utils/model.py | 58 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index a0c4b87385..adb6be39c1 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -131,11 +131,15 @@ class ModelType: yi_6b = 'yi-6b' yi_6b_200k = 'yi-6b-200k' yi_6b_chat = 'yi-6b-chat' + yi_6b_chat_awq = 'yi-6b-chat-awq' + yi_6b_chat_int8 = 'yi-6b-chat-int8' yi_9b = 'yi-9b' yi_9b_200k = 'yi-9b-200k' yi_34b = 'yi-34b' yi_34b_200k = 'yi-34b-200k' yi_34b_chat = 'yi-34b-chat' + yi_34b_chat_awq = 'yi-34b-chat-awq' + yi_34b_chat_int8 = 'yi-34b-chat-int8' # yi-vl yi_vl_6b_chat = 'yi-vl-6b-chat' yi_vl_34b_chat = 'yi-vl-34b-chat' @@ -633,9 +637,9 @@ def get_model_tokenizer_from_repo(model_dir: str, torch_dtype=torch_dtype, trust_remote_code=True, **model_kwargs) - if is_awq: + if load_model and is_awq: model.is_awq = is_awq - if gptq_bits > 0: + if load_model and gptq_bits > 0: model.gptq_bits = gptq_bits return model, tokenizer @@ -1369,6 +1373,30 @@ def cross_entropy_forward(self, inputs: Tensor, 
support_flash_attn=True, support_vllm=True, hf_model_id='01-ai/Yi-6B-Chat') +@register_model( + ModelType.yi_6b_chat_awq, + '01ai/Yi-6B-Chat-4bits', + LoRATM.llama2, + TemplateType.yi, + eos_token='<|im_end|>', + requires=['autoawq'], + torch_dtype=torch.float16, + function_kwargs={'is_awq': True}, + support_flash_attn=True, + support_vllm=True, + hf_model_id='01-ai/Yi-6B-Chat-4bits') +@register_model( + ModelType.yi_6b_chat_int8, + '01ai/Yi-6B-Chat-8bits', + LoRATM.llama2, + TemplateType.yi, + eos_token='<|im_end|>', + requires=['auto_gptq'], + torch_dtype=torch.float16, + function_kwargs={'gptq_bits': 8}, + support_flash_attn=True, + support_vllm=True, + hf_model_id='01-ai/Yi-6B-Chat-8bits') @register_model( ModelType.yi_34b_chat, '01ai/Yi-34B-Chat', @@ -1378,6 +1406,30 @@ def cross_entropy_forward(self, inputs: Tensor, support_flash_attn=True, support_vllm=True, hf_model_id='01-ai/Yi-34B-Chat') +@register_model( + ModelType.yi_34b_chat_awq, + '01ai/Yi-34B-Chat-4bits', + LoRATM.llama2, + TemplateType.yi, + eos_token='<|im_end|>', + requires=['autoawq'], + torch_dtype=torch.float16, + function_kwargs={'is_awq': True}, + support_flash_attn=True, + support_vllm=True, + hf_model_id='01-ai/Yi-34B-Chat-4bits') +@register_model( + ModelType.yi_34b_chat_int8, + '01ai/Yi-34B-Chat-8bits', + LoRATM.llama2, + TemplateType.yi, + eos_token='<|im_end|>', + requires=['auto_gptq'], + torch_dtype=torch.float16, + function_kwargs={'gptq_bits': 8}, + support_flash_attn=True, + support_vllm=True, + hf_model_id='01-ai/Yi-34B-Chat-8bits') @register_model( ModelType.yi_34b_200k, '01ai/Yi-34B-200K', @@ -2925,7 +2977,7 @@ def get_model_tokenizer_telechat(model_dir: str, **kwargs): if torch_dtype == torch.bfloat16: logger.info( - 'telechat-7b does not support the bfl16 dtype; the dtype is converted to fp16.' + 'telechat-7b does not support the bf16 dtype; the dtype is converted to fp16.' 
) torch_dtype = torch.float16 model_config = AutoConfig.from_pretrained( From 7258e11b317eb30d40c82f4896c16ffb7d797243 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 18 Apr 2024 10:59:32 +0800 Subject: [PATCH 18/26] support wizardlm2_8x22b zero3-offload docs --- ...37\346\200\201\345\205\274\345\256\271.md" | 27 ++++++++ swift/llm/ds_config/zero3_offload.json | 62 +++++++++++++++++++ swift/llm/utils/argument.py | 18 +++--- swift/llm/utils/model.py | 9 +++ swift/llm/utils/template.py | 10 +++ 5 files changed, 119 insertions(+), 7 deletions(-) create mode 100644 "docs/source/LLM/HuggingFace\347\224\237\346\200\201\345\205\274\345\256\271.md" create mode 100644 swift/llm/ds_config/zero3_offload.json diff --git "a/docs/source/LLM/HuggingFace\347\224\237\346\200\201\345\205\274\345\256\271.md" "b/docs/source/LLM/HuggingFace\347\224\237\346\200\201\345\205\274\345\256\271.md" new file mode 100644 index 0000000000..6c184361b0 --- /dev/null +++ "b/docs/source/LLM/HuggingFace\347\224\237\346\200\201\345\205\274\345\256\271.md" @@ -0,0 +1,27 @@ +# HuggingFace生态兼容 +默认我们会使用[ModelScope](https://modelscope.cn/my/overview)中的模型和数据集进行微调和推理。但是考虑到海外用户更熟悉[HuggingFace](https://huggingface.co/)生态,这里对其进行兼容。 + +你需要设置环境变量`USE_HF=1`,支持的HuggingFace模型和数据集可以参考[支持的模型和数据集](支持的模型和数据集.md),部分数据集只支持在ModelScope环境下使用。 + +以下是对`qwen1.5-7b-chat`的推理脚本: +```shell +# Experimental Environment: A10, 3090, V100 +USE_HF=1 CUDA_VISIBLE_DEVICES=0 swift infer --model_type qwen1half-7b-chat +``` + +微调脚本: +```shell +# Experimental Environment: 2 * A100 +# GPU Memory Requirement: 2 * 30GB +USE_HF=1 \ +NPROC_PER_NODE=2 \ +CUDA_VISIBLE_DEVICES=0,1 \ +swift sft \ + --model_type qwen1half-7b-chat \ + --dataset blossom-math-zh \ + --num_train_epochs 5 \ + --sft_type lora \ + --output_dir output \ +``` + +微调后推理与部署等内容参考其他文档. 
diff --git a/swift/llm/ds_config/zero3_offload.json b/swift/llm/ds_config/zero3_offload.json new file mode 100644 index 0000000000..8374e5b0d3 --- /dev/null +++ b/swift/llm/ds_config/zero3_offload.json @@ -0,0 +1,62 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "bf16": { + "enabled": "auto" + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "total_num_steps": "auto", + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 7e4bb2380c..208f015619 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -400,7 +400,7 @@ class SftArguments(ArgumentsBase): neftune_backend: Literal['swift', 'transformers'] = None gradient_checkpointing: Optional[bool] = None - # e.g. 'default-zero3', 'default-zero2', 'ds_config/zero2.json' + # e.g. 
'default-zero3', 'default-zero2', 'ds_config/zero2.json', 'zero3-offload' deepspeed: Optional[str] = None batch_size: int = 1 eval_batch_size: Optional[int] = None @@ -587,12 +587,16 @@ def __post_init__(self) -> None: if is_pai_training_job(): self._handle_pai_compat() ds_config_folder = os.path.join(__file__, '..', '..', 'ds_config') - if self.deepspeed == 'default-zero2': - self.deepspeed = os.path.abspath( - os.path.join(ds_config_folder, 'zero2.json')) - elif self.deepspeed == 'default-zero3': - self.deepspeed = os.path.abspath( - os.path.join(ds_config_folder, 'zero3.json')) + deepspeed_mapping = { + 'default-zero2': 'zero2.json', + 'default-zero3': 'zero3.json', + 'zero3-offload': 'zero3-offload.json' + } + for ds_name, ds_config in deepspeed_mapping.items(): + if self.deepspeed == ds_name: + self.deepspeed = os.path.abspath( + os.path.join(ds_config_folder, ds_config)) + break self.handle_path() self.set_model_type() diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index adb6be39c1..2d523d9640 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -1082,6 +1082,15 @@ def cross_entropy_forward(self, inputs: Tensor, return model, tokenizer +@register_model( + ModelType.wizardlm2_8x22b, + 'AI-ModelScope/WizardLM-2-8x22B', + LoRATM.llama2, + TemplateType.wizardlm2, + requires=['transformers>=4.36'], + support_flash_attn=True, + support_vllm=True, + hf_model_id='alpindale/WizardLM-2-8x22B') @register_model( ModelType.wizardlm2_7b_awq, 'AI-ModelScope/WizardLM-2-7B-AWQ', diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 8d03bbc3f8..43503b74ec 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -62,6 +62,7 @@ class TemplateType: gemma = 'gemma' mplug_owl2 = 'mplug-owl2' wizardlm2_awq = 'wizardlm2-awq' + wizardlm2 = 'wizardlm2' # compatibility. 
(Deprecated) chatml = 'chatml' telechat = 'telechat' @@ -1369,6 +1370,15 @@ def data_collator(self, Template(['{{SYSTEM}}'], ['User:\n{{QUERY}}\n\nAssistant:\n'], ['\n\n'], [''])) +_wizardlm2_system = ( + 'A chat between a curious user and an artificial intelligence assistant. ' + 'The assistant gives helpful, detailed, and polite answers to the user\'s questions. ' +) +register_template( + TemplateType.wizardlm2, + Template(['{{SYSTEM}}'], ['USER: {{QUERY}} ASSISTANT:'], [''], + [''], _wizardlm2_system)) + def get_template( template_type: str, From 2c1b466e2da6475fdeb0d827bc8e008107dc0a65 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 18 Apr 2024 11:41:46 +0800 Subject: [PATCH 19/26] update readme --- README.md | 4 ++-- README_CN.md | 3 ++- docs/source/LLM/index.md | 1 + docs/source_en/LLM/Compat-HF.md | 27 +++++++++++++++++++++++++++ docs/source_en/LLM/index.md | 1 + 5 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 docs/source_en/LLM/Compat-HF.md diff --git a/README.md b/README.md index baf34b96c2..75fd44b1b7 100644 --- a/README.md +++ b/README.md @@ -390,7 +390,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \ | XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese
English | 7B-65B | base model
chat model
long text model
MoE model | | LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B
including quantized versions | base model
chat model | | Mistral
Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-22B | base model
instruct model
MoE model | -| YI | [01AI's YI series models](https://github.com/01-ai) | Chinese
English | 6B-34B | base model
chat model
long text model | +| YI | [01AI's YI series models](https://github.com/01-ai) | Chinese
English | 6B-34B
including quantized versions | base model
chat model
long text model | | InternLM
InternLM2
InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese
English | 1.8B-20B | base model
chat model
math model | | DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math | [DeepSeek series models](https://github.com/deepseek-ai) | Chinese
English | 1.3B-67B | base model
chat model
MoE model
code model
math model | | MAMBA | [MAMBA temporal convolution model](https://github.com/state-spaces/mamba) | English | 130M-2.8B | base model | @@ -413,7 +413,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \ | dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model
chat model | | mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese
English | 13B | base model | | c4ai-command-r | [c4ai](https://cohere.com/command) | Multilingual | 35B-104B | chat model | - +| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B | chat model
MoE model | #### MLLMs diff --git a/README_CN.md b/README_CN.md index 83ce16dc5b..b4ffd93d55 100644 --- a/README_CN.md +++ b/README_CN.md @@ -388,7 +388,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \ | XVerse | [元象系列模型](https://github.com/xverse-ai) | 中文
英文 | 7B-65B | base模型
chat模型
长文本模型
MoE模型 | | | LLaMA2 | [LLaMA2系列模型](https://github.com/facebookresearch/llama) | 英文 | 7B-70B
包含量化版本 | base模型
chat模型 | | Mistral
Mixtral | [Mistral系列模型](https://github.com/mistralai/mistral-src) | 英文 | 7B-8x22B | base模型
instruct模型
MoE模型 | -| YI | [01AI的YI系列模型](https://github.com/01-ai) | 中文
英文 | 6B-34B | base模型
chat模型
长文本模型 | +| YI | [01AI的YI系列模型](https://github.com/01-ai) | 中文
英文 | 6B-34B
包含量化版本 | base模型
chat模型
长文本模型 | | InternLM
InternLM2
InternLM2-Math | [浦江实验室书生浦语系列模型](https://github.com/InternLM/InternLM) | 中文
英文 | 1.8B-20B | base模型
chat模型
数学模型 | | DeepSeek
DeepSeek-MoE
DeepSeek-Coder
DeepSeek-Math | [幻方系列模型](https://github.com/deepseek-ai) | 中文
英文 | 1.3B-67B | base模型
chat模型
MoE模型
代码模型
数学模型 | | MAMBA | [MAMBA时序卷积模型](https://github.com/state-spaces/mamba) | 英文 | 130M-2.8B | base模型 | @@ -411,6 +411,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \ | dbrx | [databricks](https://github.com/databricks/dbrx) | 英文 | 132B | base模型
chat模型 | | mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | 中文
英文 | 13B | base模型 | | c4ai-command-r | [c4ai](https://cohere.com/command) | 多语种 | 35B-104B | chat模型 | +| WizardLM2 | [WizardLM2系列模型](https://github.com/nlpxucan/WizardLM) | 多语种 | 7B-8x22B | chat模型
MoE模型 | #### 多模态大模型 diff --git a/docs/source/LLM/index.md b/docs/source/LLM/index.md index e87d53d817..8841854d8a 100644 --- a/docs/source/LLM/index.md +++ b/docs/source/LLM/index.md @@ -25,3 +25,4 @@ 2. [微调推理的命令行参数](命令行参数.md) 3. [支持的模型和数据集列表](支持的模型和数据集.md) 4. [运行速度与显存的Benchmark](Benchmark.md) +5. [HuggingFace生态兼容](HuggingFace生态兼容.md) diff --git a/docs/source_en/LLM/Compat-HF.md b/docs/source_en/LLM/Compat-HF.md new file mode 100644 index 0000000000..bbc8da4534 --- /dev/null +++ b/docs/source_en/LLM/Compat-HF.md @@ -0,0 +1,27 @@ +# HuggingFace Eco-compatibility +By default, we use models and datasets from [ModelScope](https://modelscope.cn/my/overview) for fine-tuning and inference. However, considering that overseas users are more familiar with the [HuggingFace](https://huggingface.co/) ecosystem, we have made it compatible with HuggingFace. + +To enable HuggingFace compatibility, you need to set the environment variable `USE_HF=1`. Supported HuggingFace models and datasets can be found in the [Supported Models and Datasets](Supported-models-datasets.md). Note that some datasets are only supported in the ModelScope environment. + +Here is an example inference script for qwen1.5-7b-chat: +```shell +# Experimental Environment: A10, 3090, V100 +USE_HF=1 CUDA_VISIBLE_DEVICES=0 swift infer --model_type qwen1half-7b-chat +``` + +微调脚本: +```shell +# Experimental Environment: 2 * A100 +# GPU Memory Requirement: 2 * 30GB +USE_HF=1 \ +NPROC_PER_NODE=2 \ +CUDA_VISIBLE_DEVICES=0,1 \ +swift sft \ + --model_type qwen1half-7b-chat \ + --dataset blossom-math-zh \ + --num_train_epochs 5 \ + --sft_type lora \ + --output_dir output \ +``` + +Please refer to other documents for inference after fine-tuning, and deployment . diff --git a/docs/source_en/LLM/index.md b/docs/source_en/LLM/index.md index 0a0a79d869..057a29cebc 100644 --- a/docs/source_en/LLM/index.md +++ b/docs/source_en/LLM/index.md @@ -26,3 +26,4 @@ Please check: [Multi-Modal Best Practices](../Multi-Modal/index.md) 2. 
[Command Line Parameters](Command-line-parameters.md) 3. [Supported models and datasets](Supported-models-datasets.md) 4. [Benchmark](Benchmark.md) +5. [Compatible with the HuggingFace ecosystem](Compat-HF.md) From 836066e0b416fdaaad42728f30386e6b25a70265 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 18 Apr 2024 11:49:29 +0800 Subject: [PATCH 20/26] update readme --- README.md | 2 +- README_CN.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 75fd44b1b7..cecfe652e6 100644 --- a/README.md +++ b/README.md @@ -413,7 +413,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \ | dbrx | [databricks](https://github.com/databricks/dbrx) | English | 132B | base model
chat model | | mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | Chinese
English | 13B | base model | | c4ai-command-r | [c4ai](https://cohere.com/command) | Multilingual | 35B-104B | chat model | -| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B | chat model
MoE model | +| WizardLM2 | [WizardLM2 series models](https://github.com/nlpxucan/WizardLM) | English | 7B-8x22B
including quantized versions | chat model
MoE model | #### MLLMs diff --git a/README_CN.md b/README_CN.md index b4ffd93d55..ef720c9267 100644 --- a/README_CN.md +++ b/README_CN.md @@ -411,7 +411,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \ | dbrx | [databricks](https://github.com/databricks/dbrx) | 英文 | 132B | base模型
chat模型 | | mengzi3 | [Langboat](https://github.com/Langboat/Mengzi3) | 中文
英文 | 13B | base模型 | | c4ai-command-r | [c4ai](https://cohere.com/command) | 多语种 | 35B-104B | chat模型 | -| WizardLM2 | [WizardLM2系列模型](https://github.com/nlpxucan/WizardLM) | 多语种 | 7B-8x22B | chat模型
MoE模型 | +| WizardLM2 | [WizardLM2系列模型](https://github.com/nlpxucan/WizardLM) | 多语种 | 7B-8x22B
包含量化版本 | chat模型
MoE模型 | #### 多模态大模型 From 2972df92f18a309f63a2dce059f17887d158bad5 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 18 Apr 2024 13:39:59 +0800 Subject: [PATCH 21/26] update --- swift/llm/utils/argument.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 208f015619..11c0c7dd25 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -590,7 +590,7 @@ def __post_init__(self) -> None: deepspeed_mapping = { 'default-zero2': 'zero2.json', 'default-zero3': 'zero3.json', - 'zero3-offload': 'zero3-offload.json' + 'zero3-offload': 'zero3_offload.json' } for ds_name, ds_config in deepspeed_mapping.items(): if self.deepspeed == ds_name: From 1ad7e3613940b9afeec89b985ba1825e75eb0db0 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 18 Apr 2024 13:44:42 +0800 Subject: [PATCH 22/26] update readme --- README.md | 17 +++++++++++++++++ README_CN.md | 16 ++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/README.md b/README.md index cecfe652e6..7e565f99e9 100644 --- a/README.md +++ b/README.md @@ -318,6 +318,23 @@ swift sft \ --deepspeed default-zero3 \ ``` +ZeRO3-Offload: +```shell +# Experimental Environment: 4 * A100 +# GPU Memory Requirement: 4 * 12GB +# Runtime: 60 hours +NPROC_PER_NODE=4 \ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +swift sft \ + --model_id_or_path AI-ModelScope/WizardLM-2-8x22B \ + --dataset blossom-math-zh \ + --num_train_epochs 5 \ + --sft_type lora \ + --output_dir output \ + --deepspeed zero3-offload \ +``` + + ### Inference Original model: ```shell diff --git a/README_CN.md b/README_CN.md index ef720c9267..1c0c7abb5c 100644 --- a/README_CN.md +++ b/README_CN.md @@ -316,6 +316,22 @@ swift sft \ --deepspeed default-zero3 \ ``` +ZeRO3-Offload: +```shell +# 实验环境: 4 * A100 +# 显存需求: 4 * 12GB +# 运行时长: 60小时 +NPROC_PER_NODE=4 \ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +swift sft \ + --model_id_or_path AI-ModelScope/WizardLM-2-8x22B \ + --dataset 
blossom-math-zh \ + --num_train_epochs 5 \ + --sft_type lora \ + --output_dir output \ + --deepspeed zero3-offload \ +``` + ### 推理 原始模型: ```shell From 15abebae3485692034170ef86fc393b298caf8b0 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 18 Apr 2024 13:57:01 +0800 Subject: [PATCH 23/26] update readme --- README.md | 2 ++ README_CN.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index 7e565f99e9..6eb9a768b1 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,8 @@ To facilitate use by users unfamiliar with deep learning, we provide a Gradio we Additionally, we are expanding capabilities for other modalities. Currently, we support full-parameter training and LoRA training for AnimateDiff. ## 🎉 News +- 2024.04.18: Supported models: wizardlm2-7b-awq, wizardlm2-8x22b, yi-6b-chat-awq, yi-6b-chat-int8, yi-34b-chat-awq, yi-34b-chat-int8. Supported `--deepspeed zero3-offload` and provided default zero3-offload configuration file for zero3+cpu offload usage. +- 2024.04.18: Supported compatibility with HuggingFace ecosystem using the environment variable `USE_HF`, switching to use models and datasets from HF. Please refer to the [HuggingFace ecosystem compatibility documentation](https://github.com/modelscope/swift/tree/main/docs/source_en/LLM/Compat-HF.md). - 2024.04.17: Support the evaluation for OpenAI standard interfaces. Check the [parameter documentation](docs/source_en/LLM/Command-line-parameters.md#eval-parameters) for details. - 🔥2024.04.17: Support **CodeQwen1.5-7B** series: CodeQwen1.5-7B, CodeQwen1.5-7B-Chat,CodeQwen1.5-7B-Chat-AWQ, use [this script](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/codeqwen1half_7b_chat/lora/sft.sh) to train. - 2024.04.16: Supports inference and fine-tuning of llava-v1.6-34b model. For best practice, you can refer to [here](https://github.com/modelscope/swift/tree/main/docs/source_en/Multi-Modal/llava-best-practice.md). 
diff --git a/README_CN.md b/README_CN.md index 1c0c7abb5c..3e1891b33b 100644 --- a/README_CN.md +++ b/README_CN.md @@ -40,6 +40,8 @@ SWIFT支持近**200种LLM和MLLM**(多模态大模型)的训练、推理、 此外,我们也在拓展其他模态的能力,目前我们支持了AnimateDiff的全参数训练和LoRA训练。 ## 🎉 新闻 +- 2024.04.18: 支持模型: wizardlm2-7b-awq, wizardlm2-8x22b, yi-6b-chat-awq, yi-6b-chat-int8, yi-34b-chat-awq, yi-34b-chat-int8. 支持`--deepspeed zero3-offload`, 提供了默认zero3-offload配置文件来使用zero3+cpu offload. +- 2024.04.18: 支持使用环境变量`USE_HF`兼容HuggingFace生态, 切换成使用HF中的模型和数据集, 可以查看[HuggingFace生态兼容文档](https://github.com/modelscope/swift/tree/main/docs/source/LLM/HuggingFace生态兼容.md). - 2024.04.17: 支持OpenAI样式的接口评测, 可以查看[评测参数接口文档](docs/source/LLM/命令行参数.md#eval参数)来查看使用方法. - 🔥2024.04.17: 支持 **CodeQwen1.5-7B**系列: CodeQwen1.5-7B, CodeQwen1.5-7B-Chat,CodeQwen1.5-7B-Chat-AWQ, 使用[这个脚本](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/codeqwen1half_7b_chat/lora/sft.sh)来开始训练! - 2024.04.16: 支持llava-v1.6-34b的推理与微调, 最佳实践可以查看[这里](https://github.com/modelscope/swift/tree/main/docs/source/Multi-Modal/llava最佳实践.md). 
From bd3a4a25bf8f885b6c7ac239a5c7ab33b295c89c Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 18 Apr 2024 14:20:32 +0800 Subject: [PATCH 24/26] update readme --- docs/source_en/LLM/Compat-HF.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source_en/LLM/Compat-HF.md b/docs/source_en/LLM/Compat-HF.md index bbc8da4534..14c2a8a073 100644 --- a/docs/source_en/LLM/Compat-HF.md +++ b/docs/source_en/LLM/Compat-HF.md @@ -9,7 +9,7 @@ Here is an example inference script for qwen1.5-7b-chat: USE_HF=1 CUDA_VISIBLE_DEVICES=0 swift infer --model_type qwen1half-7b-chat ``` -微调脚本: +Fine-tuning script: ```shell # Experimental Environment: 2 * A100 # GPU Memory Requirement: 2 * 30GB From 418f8d696be1da441bc61a04890d9b1f33486984 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 18 Apr 2024 15:06:30 +0800 Subject: [PATCH 25/26] better aqlm --- swift/llm/utils/model.py | 68 ++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 2d523d9640..a37d2307da 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -610,12 +610,20 @@ def get_model_tokenizer_from_repo(model_dir: str, **kwargs): """load from an independent repository""" is_awq = kwargs.pop('is_awq', False) + is_aqlm = kwargs.pop('is_aqlm', False) gptq_bits = kwargs.pop('gptq_bits', 0) is_training = kwargs.pop('is_training', False) if is_awq and is_training: _check_awq_ext() if gptq_bits > 0 and is_training: _check_gptq_model(gptq_bits, model_kwargs) + context = kwargs.get('context', None) + if is_aqlm and is_training: + require_version('transformers>=4.39') + import aqlm + context = aqlm.optimize_for_training() + if context is None: + context = nullcontext() if model_config is None: model_config = AutoConfig.from_pretrained( model_dir, trust_remote_code=True) @@ -628,7 +636,6 @@ def get_model_tokenizer_from_repo(model_dir: str, if eos_token is not None: 
tokenizer.eos_token = eos_token model = None - context = kwargs.get('context', nullcontext()) if load_model: with context: model = automodel_class.from_pretrained( @@ -1657,42 +1664,6 @@ def get_model_tokenizer_with_flash_attn(model_dir: str, **kwargs) -@register_model( - ModelType.llama2_7b_aqlm_2bit_1x16, - 'AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf', - LoRATM.llama2, - TemplateType.default_generation_bos, - ignore_file_pattern=[r'.+\.bin$'], - support_flash_attn=True, - requires=['transformers>=4.38', 'aqlm', 'torch>=2.2.0'], - support_vllm=False, - hf_model_id='ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf') -@register_model( - ModelType.mixtral_moe_7b_aqlm_2bit_1x16, - 'AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf', - LoRATM.llama2, - TemplateType.default_generation_bos, - requires=['transformers>=4.38', 'aqlm', 'torch>=2.2.0'], - support_flash_attn=True, - support_vllm=False, - support_gradient_checkpointing=False, - hf_model_id='ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf') -def get_model_tokenizer_aqlm(model_dir: str, - torch_dtype: Dtype, - model_kwargs: Dict[str, Any], - load_model: bool = True, - **kwargs): - import aqlm - context = aqlm.optimize_for_training() - return get_model_tokenizer_llama2( - model_dir, - torch_dtype, - model_kwargs, - load_model, - context=context, - **kwargs) - - @register_model( ModelType.qwen1half_0_5b_chat_awq, 'qwen/Qwen1.5-0.5B-Chat-AWQ', @@ -2358,6 +2329,29 @@ def get_model_tokenizer_deepseek_vl(model_dir: str, return model, tokenizer + +@register_model( + ModelType.llama2_7b_aqlm_2bit_1x16, + 'AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf', + LoRATM.llama2, + TemplateType.default_generation_bos, + ignore_file_pattern=[r'.+\.bin$'], + support_flash_attn=True, + requires=['transformers>=4.38', 'aqlm', 'torch>=2.2.0'], + support_vllm=False, + function_kwargs={'is_aqlm': True}, + hf_model_id='ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf') +@register_model( + ModelType.mixtral_moe_7b_aqlm_2bit_1x16, + 
'AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf', + LoRATM.llama2, + TemplateType.default_generation_bos, + requires=['transformers>=4.38', 'aqlm', 'torch>=2.2.0'], + support_flash_attn=True, + support_vllm=False, + support_gradient_checkpointing=False, + function_kwargs={'is_aqlm': True}, + hf_model_id='ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf') @register_model( ModelType.llama2_7b, 'modelscope/Llama-2-7b-ms', From 261d5c024995e5ddb700bcb04583465fa8498315 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Thu, 18 Apr 2024 15:08:55 +0800 Subject: [PATCH 26/26] lint pass --- swift/llm/utils/model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index a37d2307da..bd6158b526 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -2329,7 +2329,6 @@ def get_model_tokenizer_deepseek_vl(model_dir: str, return model, tokenizer - @register_model( ModelType.llama2_7b_aqlm_2bit_1x16, 'AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf',