diff --git a/.gitignore b/.gitignore
index 9b56a91519..084ee30b84 100644
--- a/.gitignore
+++ b/.gitignore
@@ -109,6 +109,7 @@ venv.bak/
 
 .vscode
 .idea
+.run
 
 # custom
 *.pkl
diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index a374d272a4..e7c37f6e6b 100644
--- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -334,6 +334,7 @@
 |mistral-nemo-base-2407|[AI-ModelScope/Mistral-Nemo-Base-2407](https://modelscope.cn/models/AI-ModelScope/Mistral-Nemo-Base-2407/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Nemo-Base-2407](https://huggingface.co/mistralai/Mistral-Nemo-Base-2407)|
 |mistral-nemo-instruct-2407|[AI-ModelScope/Mistral-Nemo-Instruct-2407](https://modelscope.cn/models/AI-ModelScope/Mistral-Nemo-Instruct-2407/summary)|q_proj, k_proj, v_proj|mistral-nemo|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407)|
 |mistral-large-instruct-2407|[LLM-Research/Mistral-Large-Instruct-2407](https://modelscope.cn/models/LLM-Research/Mistral-Large-Instruct-2407/summary)|q_proj, k_proj, v_proj|mistral-nemo|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Large-Instruct-2407](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)|
+|mistral-small-instruct-2409|[AI-ModelScope/Mistral-Small-Instruct-2409](https://modelscope.cn/models/AI-ModelScope/Mistral-Small-Instruct-2409/summary)|q_proj, k_proj, v_proj|mistral-nemo|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Small-Instruct-2409](https://huggingface.co/mistralai/Mistral-Small-Instruct-2409)|
 |mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✘|✘|transformers>=4.36|moe|[mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)|
 |mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|✘|✘|transformers>=4.36|moe|[mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)|
 |mixtral-moe-7b-aqlm-2bit-1x16|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation|✔|✘|✘|✘|transformers>=4.38, aqlm, torch>=2.2.0|moe|[ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf)|
diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md
index 4a5b53facf..e499219f98 100644
--- a/docs/source_en/Instruction/Supported-models-datasets.md
+++ b/docs/source_en/Instruction/Supported-models-datasets.md
@@ -334,6 +334,7 @@ The table below introcudes all models supported by SWIFT:
 |mistral-nemo-base-2407|[AI-ModelScope/Mistral-Nemo-Base-2407](https://modelscope.cn/models/AI-ModelScope/Mistral-Nemo-Base-2407/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Nemo-Base-2407](https://huggingface.co/mistralai/Mistral-Nemo-Base-2407)|
 |mistral-nemo-instruct-2407|[AI-ModelScope/Mistral-Nemo-Instruct-2407](https://modelscope.cn/models/AI-ModelScope/Mistral-Nemo-Instruct-2407/summary)|q_proj, k_proj, v_proj|mistral-nemo|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407)|
 |mistral-large-instruct-2407|[LLM-Research/Mistral-Large-Instruct-2407](https://modelscope.cn/models/LLM-Research/Mistral-Large-Instruct-2407/summary)|q_proj, k_proj, v_proj|mistral-nemo|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Large-Instruct-2407](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)|
+|mistral-small-instruct-2409|[AI-ModelScope/Mistral-Small-Instruct-2409](https://modelscope.cn/models/AI-ModelScope/Mistral-Small-Instruct-2409/summary)|q_proj, k_proj, v_proj|mistral-nemo|✔|✔|✘|✘|transformers>=4.43|-|[mistralai/Mistral-Small-Instruct-2409](https://huggingface.co/mistralai/Mistral-Small-Instruct-2409)|
 |mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✘|✘|transformers>=4.36|moe|[mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)|
 |mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|✘|✘|transformers>=4.36|moe|[mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)|
 |mixtral-moe-7b-aqlm-2bit-1x16|[AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation|✔|✘|✘|✘|transformers>=4.38, aqlm, torch>=2.2.0|moe|[ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf)|
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index c016f2a118..bc9fa9f749 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -1435,7 +1435,7 @@ def preprocess(row):
     'swift/TextCaps', [],
     preprocess_func=preprocess_text_caps,
     get_function=get_dataset_from_repo,
-    split=['train', 'val'],
+    split=['train', 'validation'],
     hf_dataset_id='HuggingFaceM4/TextCaps',
     huge_dataset=True,
     tags=['multi-modal', 'en', 'caption', 'quality'])
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 4831d2b65e..834ea21456 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -484,6 +484,7 @@ class ModelType:
     mistral_nemo_base_2407 = 'mistral-nemo-base-2407'
     mistral_nemo_instruct_2407 = 'mistral-nemo-instruct-2407'
     mistral_large_instruct_2407 = 'mistral-large-instruct-2407'
+    mistral_small_instruct_2409 = 'mistral-small-instruct-2409'
     mixtral_moe_7b = 'mixtral-moe-7b'
     mixtral_moe_7b_instruct = 'mixtral-moe-7b-instruct'
     mixtral_moe_7b_aqlm_2bit_1x16 = 'mixtral-moe-7b-aqlm-2bit-1x16'  # aqlm
@@ -2623,6 +2624,16 @@ def get_model_tokenizer_glm4v(model_dir: str,
     support_flash_attn=True,
     support_vllm=True,
     hf_model_id='mistralai/Mistral-Large-Instruct-2407')
+@register_model(
+    ModelType.mistral_small_instruct_2409,
+    'AI-ModelScope/Mistral-Small-Instruct-2409',
+    LoRATM.llama,
+    TemplateType.mistral_nemo,
+    requires=['transformers>=4.43'],
+    ignore_file_pattern=['^consolidated'],
+    support_flash_attn=True,
+    support_vllm=True,
+    hf_model_id='mistralai/Mistral-Small-Instruct-2409')
 @register_model(
     ModelType.mistral_nemo_instruct_2407,
     'AI-ModelScope/Mistral-Nemo-Instruct-2407',
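Note for reviewers (not part of the patch): a minimal sketch of how the newly registered model type could be exercised, assuming the Python quick-start helpers from the project README (`get_model_tokenizer`, `get_default_template_type`, `get_template`, `inference`) keep their documented signatures.

```python
# Sketch only: assumes swift's documented quick-start API; not part of this patch.
from swift.llm import (ModelType, get_default_template_type, get_model_tokenizer,
                       get_template, inference)

model_type = ModelType.mistral_small_instruct_2409
# Should resolve to 'mistral-nemo', matching TemplateType.mistral_nemo in the registration above.
template_type = get_default_template_type(model_type)

# First call downloads AI-ModelScope/Mistral-Small-Instruct-2409 from ModelScope;
# ignore_file_pattern=['^consolidated'] skips the redundant consolidated weight files.
model, tokenizer = get_model_tokenizer(model_type, model_kwargs={'device_map': 'auto'})
template = get_template(template_type, tokenizer)

response, _ = inference(model, tokenizer, template, 'Who are you?')
print(response)
```

The CLI path should be equivalent, e.g. `swift infer --model_type mistral-small-instruct-2409`, since the new entry reuses the existing mistral-nemo template and LoRA target modules.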