diff --git a/README.md b/README.md index e05aec137d..7871ca3c9b 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,8 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用 ## 🎉 News -- 2023.1.4: Support for **VLLM deployment**, compatible with the **OpenAI API** style. For more details, please refer to [VLLM Inference Acceleration and Deployment](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#部署) +- 🔥2023.1.12: Support **deepseek-moe** series: deepseek-moe-16b, [deepseek-moe-16b-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/deepseek_moe_16b_chat). +- 🔥2023.1.4: Support for **VLLM deployment**, compatible with the **OpenAI API** style. For more details, please refer to [VLLM Inference Acceleration and Deployment](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#部署) - 2023.1.4: Update [Benchmark](https://github.com/modelscope/swift/blob/main/docs/source/LLM/Benchmark.md) to facilitate viewing the training speed and GPU memory required for different models. - 🔥 2023.12.29: Support web-ui for training and inference, use `swift web-ui` after the installation of ms-swift. - 🔥 2023.12.29: Support DPO RLHF(Reinforcement Learning from Human Feedback) and two datasets: AI-ModelScope/stack-exchange-paired and AI-ModelScope/hh-rlhf for this task. Check [this documentation](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM%E4%BA%BA%E7%B1%BB%E5%AF%B9%E9%BD%90%E8%AE%AD%E7%BB%83%E6%96%87%E6%A1%A3.md) to start training! @@ -70,8 +71,8 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用 - 2023.12.23: Support [codegeex2-6b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/codegeex2_6b). - 2023.12.19: Support [phi2-3b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/phi2_3b). - 2023.12.18: Support for VLLM for inference acceleration. 
-- 2023.12.15: Support deepseek, deepseek-coder series: deepseek-7b, deepseek-7b-chat, deepseek-67b, deepseek-67b-chat, openbuddy-deepseek-67b-chat, deepseek-coder-1_3b, deepseek-coder-1_3b-chat, deepseek-coder-6_7b, deepseek-coder-6_7b-chat, deepseek-coder-33b, deepseek-coder-33b-chat. -- 2023.12.13: Support mistral-7b-chat-v2, [mixtral-7b-moe](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe), [mixtral-7b-moe-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe_chat). +- 2023.12.15: Support deepseek, deepseek-coder series: deepseek-7b, deepseek-7b-chat, deepseek-67b, deepseek-67b-chat, openbuddy-deepseek-67b-chat, deepseek-coder-1_3b, deepseek-coder-1_3b-instruct, deepseek-coder-6_7b, deepseek-coder-6_7b-instruct, deepseek-coder-33b, deepseek-coder-33b-instruct. +- 2023.12.13: Support mistral-7b-instruct-v2, [mixtral-moe-7b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_moe_7b), [mixtral-moe-7b-instruct](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_moe_7b_instruct). - 2023.12.9: Support the `freeze_parameters` parameter as a compromise between LoRA and full parameter. Corresponding shell scripts can be found at [full_freeze_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp). Support `disable_tqdm`, `lazy_tokenize`, `preprocess_num_proc` parameters, for details please refer to [Command-Line parameters](https://github.com/modelscope/swift/blob/main/docs/source/LLM/命令行参数.md). - 2023.12.8: Support [sus-34b-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/sus_34b_chat), support yi-6b-200k, yi-34b-200k. - 2023.12.7: Support [Multi-Node DDP training](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM%E5%BE%AE%E8%B0%83%E6%96%87%E6%A1%A3.md#%E4%BD%BF%E7%94%A8cli). 
@@ -96,7 +97,7 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用 - 🔥 2023.10.27: Support for **chatglm3** series models: chatglm3-6b-base, chatglm3-6b, chatglm3-6b-32k. The corresponding shell script can be found in [chatglm3_6b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/chatglm3_6b). - 🔥 2023.10.17: Supported **int4**, **int8** models: qwen-7b-chat-int4, qwen-14b-chat-int4, qwen-vl-chat-int4, baichuan2-7b-chat-int4, baichuan2-13b-chat-int4, qwen-7b-chat-int8, qwen-14b-chat-int8. - 2023.10.15: Supported **ziya2-13b** model series: ziya2-13b, ziya2-13b-chat. -- 2023.10.12: Supported **mistral-7b** model series: openbuddy-mistral-7b-chat, mistral-7b, mistral-7b-chat. +- 2023.10.12: Supported **mistral-7b** model series: openbuddy-mistral-7b-chat, mistral-7b, mistral-7b-instruct. - 🔥 2023.10.7: Supported **DeepSpeed ZeRO-2**, enabling LoRA (not just QLoRA) to run DDP on 2*A10. - 2023.10.4: Supported datasets in the fields of mathematics, law, SQL, and coding: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. - 🔥 2023.9.25: Supported **qwen-14b** model series: qwen-14b, qwen-14b-chat. 
@@ -128,15 +129,15 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用 - Multi-Modal: - qwen-vl series: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary), [qwen-vl-chat-int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary) - qwen-audio series: [qwen-audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary), [qwen-audio-chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary) - - Zhipu series: [cogagent-chat](https://www.modelscope.cn/models/ZhipuAI/cogagent-chat/summary), [cogagent-vqa](https://www.modelscope.cn/models/ZhipuAI/cogagent-vqa/summary) + - cogagent series: [cogagent-chat](https://www.modelscope.cn/models/ZhipuAI/cogagent-chat/summary), [cogagent-vqa](https://www.modelscope.cn/models/ZhipuAI/cogagent-vqa/summary) - General: - qwen series: [qwen-1_8b-chat](https://modelscope.cn/models/qwen/Qwen-1_8B/summary), [qwen-1_8b-chat-int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary), [qwen-1_8b-chat-int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary), [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-7b-chat-int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary), [qwen-7b-chat-int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary), [qwen-14b-chat-int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary), [qwen-14b-chat-int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary), [qwen-72b](https://modelscope.cn/models/qwen/Qwen-72B/summary), [qwen-72b-chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary), [qwen-72b-chat-int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary), 
[qwen-72b-chat-int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary) - chatglm series: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary), [chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary), [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary), [chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary) - llama series: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary) - yi series: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-6b-200k](https://modelscope.cn/models/01ai/Yi-6B-200K/summary), [yi-6b-chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary), [yi-34b-200k](https://modelscope.cn/models/01ai/Yi-34B-200K/summary), [yi-34b-chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary) - - deepseek series: [deepseek-7b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary), [deepseek-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary), [deepseek-67b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary), [deepseek-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary) + - deepseek series: [deepseek-7b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary), [deepseek-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary), 
[deepseek-67b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary), [deepseek-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary), [deepseek-moe-16b](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary), [deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary) - openbuddy series: [openbuddy-llama2-13b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary), [openbuddy-mistral-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary), [openbuddy-zephyr-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary), [openbuddy-deepseek-67b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary) - - mistral series: [mistral-7b](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary), [mistral-7b-chat](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary), [mistral-7b-chat-v2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary), [mixtral-7b-moe](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary), [mixtral-7b-moe-chat](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary) + - mistral series: [mistral-7b](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary), [mistral-7b-instruct](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary), [mistral-7b-instruct-v2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary), [mixtral-moe-7b](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary), 
[mixtral-moe-7b-instruct](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary) - baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary) - internlm series: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary) - xverse series: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary), [xverse-65b](https://modelscope.cn/models/xverse/XVERSE-65B/summary) @@ -150,13 +151,13 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用 - tongyi-finance series: 
[tongyi-finance-14b](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary), [tongyi-finance-14b-chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary), [tongyi-finance-14b-chat-int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary) - Coding: - codefuse series: [codefuse-codellama-34b-chat](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary) - - deepseek-coder series: [deepseek-coder-1_3b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary), [deepseek-coder-1_3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary), [deepseek-coder-6_7b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary), [deepseek-coder-6_7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary), [deepseek-coder-33b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary), [deepseek-coder-33b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary) + - deepseek-coder series: [deepseek-coder-1_3b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary), [deepseek-coder-1_3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary), [deepseek-coder-6_7b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary), [deepseek-coder-6_7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary), [deepseek-coder-33b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary), [deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary) - phi series: [phi2-3b](https://modelscope.cn/models/AI-ModelScope/phi-2/summary) - Supported Datasets: 
[[Detail]](https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md#%E6%95%B0%E6%8D%AE%E9%9B%86) - NLP: - General: 🔥[alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), 🔥[alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), [multi-alpaca-all](https://www.modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary), [instinwild-en](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [instinwild-zh](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [cot-en](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [cot-zh](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [firefly-all-zh](https://www.modelscope.cn/datasets/wyj123456/firefly/summary), [instruct-en](https://www.modelscope.cn/datasets/wyj123456/instruct/summary), [gpt4all-en](https://www.modelscope.cn/datasets/wyj123456/GPT4all/summary), [sharegpt-en](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [sharegpt-zh](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [tutu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture/summary), [wikipedia-zh](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered/summary), [open-orca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary), [open-orca-gpt4](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary), [sharegpt-gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary) - Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), 🔥[damo-agent-mini-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), 🔥[agent-instruct-all-en](https://modelscope.cn/datasets/ZhipuAI/AgentInstruct/summary) - - RLHF: [hh-rlhf](https://www.modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary), 
[stack-exchange-paired](https://www.modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary) + - RLHF: 🔥[hh-rlhf](https://www.modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary), [stack-exchange-paired](https://www.modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary) - Coding: [code-alpaca-en](https://www.modelscope.cn/datasets/wyj123456/code_alpaca_en/summary), 🔥[leetcode-python-en](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary), 🔥[codefuse-python-en](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), 🔥[codefuse-evol-instruction-zh](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k/summary) - Medical: [medical-en](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-mini-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary) - Law: 🔥[lawyer-llama-zh](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary), [tigerbot-law-zh](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary) diff --git a/README_CN.md b/README_CN.md index 1c54b84fc1..e292fcb51e 100644 --- a/README_CN.md +++ b/README_CN.md @@ -60,7 +60,8 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 用户可以查看 [SWIFT官方文档](docs/source/GetStarted/快速使用.md) 来了解详细信息。 ## 🎉 新闻 -- 2023.1.4: 支持**VLLM部署**, 兼容**OpenAI API**样式, 具体可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#部署). +- 🔥2023.1.12: 支持**deepseek-moe**系列: deepseek-moe-16b, [deepseek-moe-16b-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/deepseek_moe_16b_chat). +- 🔥2023.1.4: 支持**VLLM部署**, 兼容**OpenAI API**样式, 具体可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#部署). 
- 2023.1.4: 更新[Benchmark](https://github.com/modelscope/swift/blob/main/docs/source/LLM/Benchmark.md), 方便查看不同模型训练的速度和所需显存. - 🔥 2023.12.29: 支持web-ui进行sft训练和推理,安装ms-swift后使用`swift web-ui`开启 - 🔥 2023.12.29: 支持 DPO RLHF(Reinforcement Learning from Human Feedback) 和两个用于此任务的数据集: AI-ModelScope/stack-exchange-paired 以及 AI-ModelScope/hh-rlhf. 查看[文档](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM%E4%BA%BA%E7%B1%BB%E5%AF%B9%E9%BD%90%E8%AE%AD%E7%BB%83%E6%96%87%E6%A1%A3.md)开启训练! @@ -68,8 +69,8 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 - 2023.12.23: 支持[codegeex2-6b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/codegeex2_6b). - 2023.12.19: 支持[phi2-3b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/phi2_3b). - 2023.12.18: 支持VLLM进行推理加速. -- 2023.12.15: 支持deepseek, deepseek-coder系列: deepseek-7b, deepseek-7b-chat, deepseek-67b, deepseek-67b-chat, openbuddy-deepseek-67b-chat, deepseek-coder-1_3b, deepseek-coder-1_3b-chat, deepseek-coder-6_7b, deepseek-coder-6_7b-chat, deepseek-coder-33b, deepseek-coder-33b-chat. -- 2023.12.13: 支持mistral-7b-chat-v2, [mixtral-7b-moe](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe), [mixtral-7b-moe-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe_chat). +- 2023.12.15: 支持deepseek, deepseek-coder系列: deepseek-7b, deepseek-7b-chat, deepseek-67b, deepseek-67b-chat, openbuddy-deepseek-67b-chat, deepseek-coder-1_3b, deepseek-coder-1_3b-instruct, deepseek-coder-6_7b, deepseek-coder-6_7b-instruct, deepseek-coder-33b, deepseek-coder-33b-instruct. +- 2023.12.13: 支持mistral-7b-instruct-v2, [mixtral-moe-7b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_moe_7b), [mixtral-moe-7b-instruct](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_moe_7b_instruct). - 2023.12.9: 支持`freeze_parameters`参数, 作为lora和全参数训练的折中方案. 
对应的sh可以查看[full_freeze_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp). 支持`disable_tqdm`, `lazy_tokenize`, `preprocess_num_proc`参数, 具体可以查看[命令行参数](https://github.com/modelscope/swift/blob/main/docs/source/LLM/命令行参数.md). - 2023.12.8: 支持[sus-34b-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/sus_34b_chat), 支持yi-6b-200k, yi-34b-200k. - 2023.12.7: 支持[Multi-Node DDP训练](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM%E5%BE%AE%E8%B0%83%E6%96%87%E6%A1%A3.md#%E4%BD%BF%E7%94%A8cli). @@ -94,7 +95,7 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 - 🔥 2023.10.27: 支持**chatglm3**系列模型: chatglm3-6b-base, chatglm3-6b, chatglm3-6b-32k. 对应的sh脚本可以查看[chatglm3_6b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/chatglm3_6b). - 🔥 2023.10.17: 支持**int4**, **int8**模型的SFT: qwen-7b-chat-int4, qwen-14b-chat-int4, qwen-vl-chat-int4, baichuan2-7b-chat-int4, baichuan2-13b-chat-int4, qwen-7b-chat-int8, qwen-14b-chat-int8. - 2023.10.15: 支持**ziya2-13b**系列模型: ziya2-13b, ziya2-13b-chat. -- 2023.10.12: 支持**mistral-7b**系列模型: openbuddy-mistral-7b-chat, mistral-7b, mistral-7b-chat. +- 2023.10.12: 支持**mistral-7b**系列模型: openbuddy-mistral-7b-chat, mistral-7b, mistral-7b-instruct. - 🔥 2023.10.7: 支持**DeepSpeed ZeRO-2**, 使得lora(不仅仅是qlora)可以在双卡A10上运行DDP. - 2023.10.4: 支持更多数学, 法律, SQL, 代码领域的数据集: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. - 🔥 2023.9.25: 支持**qwen-14b**系列: qwen-14b, qwen-14b-chat. 
@@ -126,15 +127,15 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 - 多模态: - qwen-vl 系列: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary), [qwen-vl-chat-int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary) - qwen-audio 系列: [qwen-audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary), [qwen-audio-chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary) - - Zhipu多模态模型: [cogagent-chat](https://www.modelscope.cn/models/ZhipuAI/cogagent-chat/summary), [cogagent-vqa](https://www.modelscope.cn/models/ZhipuAI/cogagent-vqa/summary) + - cogagent 系列: [cogagent-chat](https://www.modelscope.cn/models/ZhipuAI/cogagent-chat/summary), [cogagent-vqa](https://www.modelscope.cn/models/ZhipuAI/cogagent-vqa/summary) - 通用: - qwen 系列: [qwen-1_8b-chat](https://modelscope.cn/models/qwen/Qwen-1_8B/summary), [qwen-1_8b-chat-int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary), [qwen-1_8b-chat-int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary), [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-7b-chat-int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary), [qwen-7b-chat-int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary), [qwen-14b-chat-int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary), [qwen-14b-chat-int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary), [qwen-72b](https://modelscope.cn/models/qwen/Qwen-72B/summary), [qwen-72b-chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary), [qwen-72b-chat-int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary), 
[qwen-72b-chat-int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary) - chatglm 系列: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary), [chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary), [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary), [chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary) - llama 系列: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary) - yi 系列: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-6b-200k](https://modelscope.cn/models/01ai/Yi-6B-200K/summary), [yi-6b-chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary), [yi-34b-200k](https://modelscope.cn/models/01ai/Yi-34B-200K/summary), [yi-34b-chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary) - - deepseek 系列: [deepseek-7b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary), [deepseek-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary), [deepseek-67b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary), [deepseek-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary) + - deepseek 系列: [deepseek-7b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary), [deepseek-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary), 
[deepseek-67b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary), [deepseek-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary), [deepseek-moe-16b](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary), [deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary) - openbuddy 系列: [openbuddy-llama2-13b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary), [openbuddy-mistral-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary), [openbuddy-zephyr-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary), [openbuddy-deepseek-67b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary) - - mistral 系列: [mistral-7b](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary), [mistral-7b-chat](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary), [mistral-7b-chat-v2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary), [mixtral-7b-moe](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary), [mixtral-7b-moe-chat](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary) + - mistral 系列: [mistral-7b](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary), [mistral-7b-instruct](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary), [mistral-7b-instruct-v2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary), [mixtral-moe-7b](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary), 
[mixtral-moe-7b-instruct](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary) - baichuan 系列: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary) - internlm 系列: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary) - xverse 系列: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary), [xverse-65b](https://modelscope.cn/models/xverse/XVERSE-65B/summary) @@ -148,13 +149,13 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 - tongyi-finance 系列: 
[tongyi-finance-14b](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary), [tongyi-finance-14b-chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary), [tongyi-finance-14b-chat-int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary) - 代码: - codefuse 系列: [codefuse-codellama-34b-chat](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary) - - deepseek-coder 系列: [deepseek-coder-1_3b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary), [deepseek-coder-1_3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary), [deepseek-coder-6_7b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary), [deepseek-coder-6_7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary), [deepseek-coder-33b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary), [deepseek-coder-33b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary) + - deepseek-coder 系列: [deepseek-coder-1_3b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary), [deepseek-coder-1_3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary), [deepseek-coder-6_7b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary), [deepseek-coder-6_7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary), [deepseek-coder-33b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary), [deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary) - phi 系列: [phi2-3b](https://modelscope.cn/models/AI-ModelScope/phi-2/summary) - 支持的数据集: [[详细]](https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md#%E6%95%B0%E6%8D%AE%E9%9B%86) - NLP: - 通用: 
🔥[alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), 🔥[alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), [multi-alpaca-all](https://www.modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary), [instinwild-en](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [instinwild-zh](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [cot-en](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [cot-zh](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [firefly-all-zh](https://www.modelscope.cn/datasets/wyj123456/firefly/summary), [instruct-en](https://www.modelscope.cn/datasets/wyj123456/instruct/summary), [gpt4all-en](https://www.modelscope.cn/datasets/wyj123456/GPT4all/summary), [sharegpt-en](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [sharegpt-zh](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [tutu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture/summary), [wikipedia-zh](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered/summary), [open-orca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary), [open-orca-gpt4](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary), [sharegpt-gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary) - Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), 🔥[damo-agent-mini-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), 🔥[agent-instruct-all-en](https://modelscope.cn/datasets/ZhipuAI/AgentInstruct/summary) - - RLHF: [hh-rlhf](https://www.modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary), [stack-exchange-paired](https://www.modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary) + - RLHF: 🔥[hh-rlhf](https://www.modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary), 
[stack-exchange-paired](https://www.modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary) - 代码: [code-alpaca-en](https://www.modelscope.cn/datasets/wyj123456/code_alpaca_en/summary), 🔥[leetcode-python-en](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary), 🔥[codefuse-python-en](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), 🔥[codefuse-evol-instruction-zh](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k/summary) - 医疗: [medical-en](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-mini-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary) - 法律: 🔥[lawyer-llama-zh](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary), [tigerbot-law-zh](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary) diff --git "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" index e5c212169b..7d36876773 100644 --- "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" +++ "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" @@ -413,10 +413,10 @@ CUDA_VISIBLE_DEVICES=0 swift app-ui --model_type qwen-7b-chat import os os.environ['CUDA_VISIBLE_DEVICES'] = '0' -from swift.llm import InferArguments, ModelType, app_ui_main +from swift.llm import AppUIArguments, ModelType, app_ui_main -infer_args = InferArguments(model_type=ModelType.qwen_7b_chat) -app_ui_main(infer_args) +app_ui_args = AppUIArguments(model_type=ModelType.qwen_7b_chat) +app_ui_main(app_ui_args) ``` 使用bnb量化: @@ -424,10 +424,10 @@ app_ui_main(infer_args) import os os.environ['CUDA_VISIBLE_DEVICES'] = '0' -from swift.llm import InferArguments, ModelType, app_ui_main +from swift.llm import AppUIArguments, ModelType, app_ui_main -infer_args = 
InferArguments(model_type=ModelType.qwen_7b_chat, quantization_bit=4) -app_ui_main(infer_args) +app_ui_args = AppUIArguments(model_type=ModelType.qwen_7b_chat, quantization_bit=4) +app_ui_main(app_ui_args) ``` ### qwen-7b @@ -441,10 +441,10 @@ CUDA_VISIBLE_DEVICES=0 swift app-ui --model_type qwen-7b import os os.environ['CUDA_VISIBLE_DEVICES'] = '0' -from swift.llm import InferArguments, ModelType, app_ui_main +from swift.llm import AppUIArguments, ModelType, app_ui_main -infer_args = InferArguments(model_type=ModelType.qwen_7b) -app_ui_main(infer_args) +app_ui_args = AppUIArguments(model_type=ModelType.qwen_7b) +app_ui_main(app_ui_args) ``` ### 微调后模型 diff --git "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 052fd250d8..33a076c5df 100644 --- "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -1,10 +1,12 @@ # 命令行参数 ## 目录 -- [sft 命令行参数](#sft-命令行参数) -- [merge-lora infer app-ui 命令行参数](#merge-lora-infer-app-ui-命令行参数) -- [deploy 命令行参数](#deploy-命令行参数) +- [SFT 参数](#SFT-参数) +- [DPO 参数](#DPO-参数) +- [merge-lora infer 参数](#merge-lora-infer-参数) +- [app-ui 参数](#app-ui-参数) +- [deploy 参数](#deploy-参数) -## sft 命令行参数 +## SFT 参数 - `--model_type`: 表示你选择的模型类型, 默认是`None`. 如果没有指定`model_id_or_path`, 则抛出异常. 如果指定了`model_id_or_path`, 则会根据`model_id_or_path`以及`MODEL_MAPPING`推断`model_type`. `model_type`和`model_id_or_path`这两个参数不能同时指定. 可以选择的`model_type`可以查看`MODEL_MAPPING.keys()`. - `--model_id_or_path`: 表示模型在ModelScope Hub中的`model_id`, 不区分大小写, 默认为`None`. 如果`--model_id_or_path`未被注册, 则会抛出异常. 你可以使用`model_type`的方式指定模型类型, 也可以通过`model_id_or_path`的方式指定模型类型. - `--model_revision`: 表示模型在ModelScope Hub中对应`model_id`的版本号, 默认为`None`. `model_revision`指定为`None`, 则使用注册在`MODEL_MAPPING`中的revision. 否则强制使用命令行传入的`model_revision`. 
@@ -92,15 +94,15 @@ - `--repetition_penalty`: 默认为`1.05`. 该参数只有在`predict_with_generate`设置为True的时候才生效. - `--num_beams`: 默认为`1`. 该参数只有在`predict_with_generate`设置为True的时候才生效. -## DPO参数 +## DPO 参数 -DPO参数继承了上面的SFT参数, 除此之外增加了以下参数: +dpo参数继承了sft参数, 除此之外增加了以下参数: -- `--ref_model_type` 对比模型类型, 可以选择的`model_type`可以查看`MODEL_MAPPING.keys()`. -- `--max_prompt_length` 最大的提示长度, 该参数会传入DPOTrainer中, 使prompt长度不超过该值的设置, 默认值1024. +- `--ref_model_type`: 对比模型的类型, 可以选择的`model_type`可以查看`MODEL_MAPPING.keys()`. +- `--max_prompt_length`: 最大的提示长度, 该参数会传入DPOTrainer中, 使prompt长度不超过该值的设置, 默认值`1024`. -## merge-lora infer app-ui 命令行参数 +## merge-lora infer 参数 - `--model_type`: 默认值为`None`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. - `--model_id_or_path`: 默认值为`None`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. 推荐使用model_type的方式指定. - `--model_revision`: 默认值为`None`. 具体的参数介绍可以在`sft.sh命令行参数`中查看. 如果`model_id_or_path`为None或者是本地的模型目录, 则该参数失效. @@ -142,14 +144,23 @@ DPO参数继承了上面的SFT参数, 除此之外增加了以下参数: - `--save_safetensors`: 保存成`safetensors`文件还是`bin`文件. 默认为`True`. - `--overwrite_generation_config`: 是否将评估所使用的generation_config保存成`generation_config.json`文件, 默认为`None`. 如果指定了`ckpt_dir`, 则设置为`True`, 否则设置为`False`. 训练时保存的generation_config文件将被覆盖. - `--verbose`: 如果设置为False, 则使用tqdm样式推理. 如果设置为True, 则输出推理的query, response, label. 默认为`None`, 进行自动选择, 即`len(val_dataset) >= 100`时, 设置为False, 否则设置为True. 该参数只有在使用数据集评估时生效. -- `--share`: 传递给gradio的`demo.queue().launch(...)`函数. 该参数只有在使用`app-ui`时才生效. - `--gpu_memory_utilization`: 初始化vllm引擎`EngineArgs`的参数, 默认为`0.9`. 该参数只有在使用vllm时才生效. - `--tensor_parallel_size`: 初始化vllm引擎`EngineArgs`的参数, 默认为`1`. 该参数只有在使用vllm时才生效. -## deploy 命令行参数 +## app-ui 参数 + +app-ui参数继承了infer参数, 除此之外增加了以下参数: + +- `--server_name`: 默认为`'127.0.0.1'`. 传递给gradio的`demo.queue().launch(...)`函数. +- `--server_port`: 默认为`7860`. 传递给gradio的`demo.queue().launch(...)`函数. +- `--share`: 默认为`False`. 传递给gradio的`demo.queue().launch(...)`函数. + +## deploy 参数 + +deploy参数继承了infer参数, 除此之外增加了以下参数: + - `--host`: 默认为`'127.0.0.1`. - `--port`: 默认为`8000`. - `--ssl_keyfile`: 默认为`None`. 
- `--ssl_certfile`: 默认为`None`. -- 其他参数继承自infer的命令行参数. diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 51e0b33168..090af658de 100644 --- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -55,6 +55,8 @@ |yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|✔|| |deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| |deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔|| +|deepseek-moe-16b|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|| +|deepseek-moe-16b-chat|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✘|| |deepseek-67b|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| |deepseek-67b-chat|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔|| |openbuddy-llama2-13b-chat|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|| @@ -64,10 +66,10 
@@ |openbuddy-zephyr-7b-chat|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.34| |openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|| |mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.34| -|mistral-7b-chat|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34| -|mistral-7b-chat-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34| -|mixtral-7b-moe|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.36| -|mixtral-7b-moe-chat|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.36| +|mistral-7b-instruct|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34| +|mistral-7b-instruct-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34| +|mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.36| 
+|mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.36| |baichuan-7b|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)|W_pack|default-generation|✘|✔|transformers<4.34| |baichuan-13b|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary)|W_pack|default-generation|✘|✔|transformers<4.34| |baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|✘|✔|transformers<4.34| @@ -104,11 +106,11 @@ |tongyi-finance-14b-chat-int4|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5| |codefuse-codellama-34b-chat|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)|q_proj, k_proj, v_proj|codefuse-codellama|✔|✔|| |deepseek-coder-1_3b|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| -|deepseek-coder-1_3b-chat|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔|| +|deepseek-coder-1_3b-instruct|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔|| |deepseek-coder-6_7b|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| -|deepseek-coder-6_7b-chat|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, 
k_proj, v_proj|deepseek-coder|✔|✔|| +|deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔|| |deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| -|deepseek-coder-33b-chat|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔|| +|deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔|| |phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|✔|✔|| |cogagent-chat|[ZhipuAI/cogagent-chat](https://modelscope.cn/models/ZhipuAI/cogagent-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent|✘|✘|| |cogagent-vqa|[ZhipuAI/cogagent-vqa](https://modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent|✘|✘|| @@ -172,5 +174,8 @@ |ner-jave-zh|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)|1266|0|118.3±45.5, min=44, max=223|chat, ner| |coco-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|414113|40504|298.8±2.8, min=294, max=351|chat, multi-modal, vision| |🔥coco-mini-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|298.8±2.8, min=294, max=339|chat, multi-modal, vision| 
+|capcha-images|[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images/summary)|6000|2000|29.0±0.0, min=29, max=29|chat, multi-modal, vision| |aishell1-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|134424|7176|152.2±36.8, min=63, max=419|chat, multi-modal, audio| |🔥aishell1-mini-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|14326|200|152.0±35.5, min=74, max=359|chat, multi-modal, audio| +|stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)|4483004|0|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise| +|hh-rlhf|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|42537|2312|163.4±117.7, min=27, max=964|hfrl, dpo, pairwise| diff --git "a/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" index fe30db7974..008bdc1299 100644 --- "a/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -260,14 +260,14 @@ CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'qwen-7b-chat/vx-xxx/checkpoint-xx import os os.environ['CUDA_VISIBLE_DEVICES'] = '0' -from swift.llm import InferArguments, merge_lora_main, app_ui_main +from swift.llm import AppUIArguments, merge_lora_main, app_ui_main best_model_checkpoint = 'qwen-7b-chat/vx-xxx/checkpoint-xxx' -infer_args = InferArguments( +app_ui_args = AppUIArguments( 
ckpt_dir=best_model_checkpoint, eval_human=True) -# merge_lora_main(infer_args) -result = app_ui_main(infer_args) +# merge_lora_main(app_ui_args) +result = app_ui_main(app_ui_args) ``` 使用CLI: diff --git a/examples/pytorch/llm/app.py b/examples/pytorch/llm/app.py index c9a2083033..9a746ac7d4 100644 --- a/examples/pytorch/llm/app.py +++ b/examples/pytorch/llm/app.py @@ -3,14 +3,14 @@ # os.environ['CUDA_VISIBLE_DEVICES'] = '0' import custom -from swift.llm import InferArguments, ModelType, app_ui_main +from swift.llm import AppUIArguments, ModelType, app_ui_main if __name__ == '__main__': # Please refer to the `infer.sh` for setting the parameters. # text-generation - # args = InferArguments(model_type=ModelType.chatglm3_6b_base) + # args = AppUIArguments(model_type=ModelType.chatglm3_6b_base) # or chat - args = InferArguments(model_type=ModelType.qwen_7b_chat_int4) + args = AppUIArguments(model_type=ModelType.qwen_7b_chat_int4) # or load from ckpt dir - # args = InferArguments(ckpt_dir='xxx/vx_xxx/checkpoint-xxx') + # args = AppUIArguments(ckpt_dir='xxx/vx_xxx/checkpoint-xxx') app_ui_main(args) diff --git a/examples/pytorch/llm/scripts/deepseek_moe_16b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/deepseek_moe_16b_chat/lora/infer.sh new file mode 100644 index 0000000000..146337506a --- /dev/null +++ b/examples/pytorch/llm/scripts/deepseek_moe_16b_chat/lora/infer.sh @@ -0,0 +1,12 @@ +# Experimental environment: A100 +CUDA_VISIBLE_DEVICES=0 \ +swift infer \ + --ckpt_dir "output/deepseek-moe-16b-chat/vx_xxx/checkpoint-xxx" \ + --load_dataset_config true \ + --max_length 4096 \ + --use_flash_attn true \ + --max_new_tokens 2048 \ + --temperature 0.1 \ + --top_p 0.7 \ + --repetition_penalty 1.05 \ + --do_sample true \ diff --git a/examples/pytorch/llm/scripts/deepseek_moe_16b_chat/lora/sft.sh b/examples/pytorch/llm/scripts/deepseek_moe_16b_chat/lora/sft.sh new file mode 100644 index 0000000000..3a0828cb33 --- /dev/null +++ 
b/examples/pytorch/llm/scripts/deepseek_moe_16b_chat/lora/sft.sh @@ -0,0 +1,12 @@ +# Experimental environment: A100 +# 52GB GPU memory +CUDA_VISIBLE_DEVICES=0 \ +swift sft \ + --model_type deepseek-moe-16b-chat \ + --dataset damo-agent-mini-zh \ + --train_dataset_sample 20000 \ + --max_length 4096 \ + --gradient_checkpointing true \ + --eval_steps 100 \ + --use_flash_attn true \ + --output_dir output \ diff --git a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_ddp_ds/infer.sh similarity index 85% rename from examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh rename to examples/pytorch/llm/scripts/mistral_7b_instruct/lora_ddp_ds/infer.sh index ebbd08ee99..e8e1c112b4 100644 --- a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh +++ b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_ddp_ds/infer.sh @@ -3,7 +3,7 @@ PYTHONPATH=../../.. \ CUDA_VISIBLE_DEVICES=0 \ python llm_infer.py \ - --ckpt_dir "output/mistral-7b-chat/vx_xxx/checkpoint-xxx" \ + --ckpt_dir "output/mistral-7b-instruct/vx_xxx/checkpoint-xxx" \ --load_dataset_config true \ --max_length 4096 \ --max_new_tokens 2048 \ diff --git a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_ddp_ds/sft.sh similarity index 96% rename from examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/sft.sh rename to examples/pytorch/llm/scripts/mistral_7b_instruct/lora_ddp_ds/sft.sh index 63590f83d7..363fc92d7d 100644 --- a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/sft.sh +++ b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_ddp_ds/sft.sh @@ -37,7 +37,7 @@ torchrun \ --save_total_limit 2 \ --logging_steps 10 \ --push_to_hub false \ - --hub_model_id mistral-7b-chat-lora \ + --hub_model_id mistral-7b-instruct-lora \ --hub_private_repo true \ --hub_token 'your-sdk-token' \ --deepspeed_config_path 'ds_config/zero2.json' \ diff 
--git a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_mp_ddp/infer.sh similarity index 85% rename from examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh rename to examples/pytorch/llm/scripts/mistral_7b_instruct/lora_mp_ddp/infer.sh index a3c6932be7..0ac3874235 100644 --- a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_mp_ddp/infer.sh @@ -3,7 +3,7 @@ PYTHONPATH=../../.. \ CUDA_VISIBLE_DEVICES=0 \ python llm_infer.py \ - --ckpt_dir "output/mistral-7b-chat/vx_xxx/checkpoint-xxx" \ + --ckpt_dir "output/mistral-7b-instruct/vx_xxx/checkpoint-xxx" \ --load_dataset_config true \ --max_length 4096 \ --max_new_tokens 2048 \ diff --git a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_mp_ddp/sft.sh similarity index 96% rename from examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/sft.sh rename to examples/pytorch/llm/scripts/mistral_7b_instruct/lora_mp_ddp/sft.sh index cd0dfbd6cd..77e1d83562 100644 --- a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_mp_ddp/sft.sh @@ -37,6 +37,6 @@ torchrun \ --save_total_limit 2 \ --logging_steps 10 \ --push_to_hub false \ - --hub_model_id mistral-7b-chat-lora \ + --hub_model_id mistral-7b-instruct-lora \ --hub_private_repo true \ --hub_token 'your-sdk-token' \ diff --git a/examples/pytorch/llm/scripts/mixtral_7b_moe/lora/infer.sh b/examples/pytorch/llm/scripts/mixtral_moe_7b/lora/infer.sh similarity index 85% rename from examples/pytorch/llm/scripts/mixtral_7b_moe/lora/infer.sh rename to examples/pytorch/llm/scripts/mixtral_moe_7b/lora/infer.sh index b8299c8e77..778db911ee 100644 --- a/examples/pytorch/llm/scripts/mixtral_7b_moe/lora/infer.sh +++ b/examples/pytorch/llm/scripts/mixtral_moe_7b/lora/infer.sh @@ -3,7 +3,7 
@@ PYTHONPATH=../../.. \ CUDA_VISIBLE_DEVICES=0,1 \ python llm_infer.py \ - --ckpt_dir "output/mixtral-7b-moe/vx_xxx/checkpoint-xxx" \ + --ckpt_dir "output/mixtral-moe-7b/vx_xxx/checkpoint-xxx" \ --load_dataset_config true \ --max_length 2048 \ --use_flash_attn true \ diff --git a/examples/pytorch/llm/scripts/mixtral_7b_moe/lora/sft.sh b/examples/pytorch/llm/scripts/mixtral_moe_7b/lora/sft.sh similarity index 100% rename from examples/pytorch/llm/scripts/mixtral_7b_moe/lora/sft.sh rename to examples/pytorch/llm/scripts/mixtral_moe_7b/lora/sft.sh diff --git a/examples/pytorch/llm/scripts/mixtral_7b_moe_chat/lora/infer.sh b/examples/pytorch/llm/scripts/mixtral_moe_7b_instruct/lora/infer.sh similarity index 83% rename from examples/pytorch/llm/scripts/mixtral_7b_moe_chat/lora/infer.sh rename to examples/pytorch/llm/scripts/mixtral_moe_7b_instruct/lora/infer.sh index c30c0aaf6b..40e0e35233 100644 --- a/examples/pytorch/llm/scripts/mixtral_7b_moe_chat/lora/infer.sh +++ b/examples/pytorch/llm/scripts/mixtral_moe_7b_instruct/lora/infer.sh @@ -3,7 +3,7 @@ PYTHONPATH=../../.. 
\ CUDA_VISIBLE_DEVICES=0,1 \ python llm_infer.py \ - --ckpt_dir "output/mixtral-7b-moe-chat/vx_xxx/checkpoint-xxx" \ + --ckpt_dir "output/mixtral-moe-7b-instruct/vx_xxx/checkpoint-xxx" \ --load_dataset_config true \ --max_length 2048 \ --use_flash_attn true \ diff --git a/examples/pytorch/llm/scripts/mixtral_7b_moe_chat/lora/sft.sh b/examples/pytorch/llm/scripts/mixtral_moe_7b_instruct/lora/sft.sh similarity index 100% rename from examples/pytorch/llm/scripts/mixtral_7b_moe_chat/lora/sft.sh rename to examples/pytorch/llm/scripts/mixtral_moe_7b_instruct/lora/sft.sh diff --git a/swift/llm/app_ui.py b/swift/llm/app_ui.py index b3896d338a..eafcc831a9 100644 --- a/swift/llm/app_ui.py +++ b/swift/llm/app_ui.py @@ -3,7 +3,7 @@ from swift.utils import get_main from .infer import merge_lora, prepare_model_template -from .utils import (History, InferArguments, inference_stream, +from .utils import (AppUIArguments, History, inference_stream, limit_history_length) @@ -11,7 +11,7 @@ def clear_session() -> History: return [] -def gradio_generation_demo(args: InferArguments) -> None: +def gradio_generation_demo(args: AppUIArguments) -> None: import gradio as gr if args.infer_backend == 'vllm': from swift.llm import prepare_vllm_engine_template, inference_stream_vllm, inference_vllm @@ -43,10 +43,18 @@ def model_generation(query: str) -> str: output_box = gr.Textbox(lines=16, label='Output', max_lines=16) send = gr.Button('🚀 发送') send.click(model_generation, inputs=[input_box], outputs=[output_box]) - demo.queue().launch(height=1000, share=args.share) - - -def gradio_chat_demo(args: InferArguments) -> None: + # Compatible with InferArguments + share = getattr(args, 'share', False) + server_name = getattr(args, 'server_name', '127.0.0.1') + server_port = getattr(args, 'server_port', 7860) + demo.queue().launch( + height=1000, + share=share, + server_name=server_name, + server_port=server_port) + + +def gradio_chat_demo(args: AppUIArguments) -> None: import gradio as gr if 
args.infer_backend == 'vllm': from swift.llm import prepare_vllm_engine_template, inference_stream_vllm @@ -86,10 +94,18 @@ def model_chat(query: str, history: History) -> Tuple[str, History]: model_chat, inputs=[message, chatbot], outputs=[message, chatbot]) clear_history.click( fn=clear_session, inputs=[], outputs=[chatbot], queue=False) - demo.queue().launch(height=1000, share=args.share) - - -def llm_app_ui(args: InferArguments) -> None: + # Compatible with InferArguments + share = getattr(args, 'share', False) + server_name = getattr(args, 'server_name', '127.0.0.1') + server_port = getattr(args, 'server_port', 7860) + demo.queue().launch( + height=1000, + share=share, + server_name=server_name, + server_port=server_port) + + +def llm_app_ui(args: AppUIArguments) -> None: args.eval_human = True if args.merge_lora_and_save: merge_lora(args, device_map='cpu') @@ -99,4 +115,4 @@ def llm_app_ui(args: InferArguments) -> None: gradio_chat_demo(args) -app_ui_main = get_main(InferArguments, llm_app_ui) +app_ui_main = get_main(AppUIArguments, llm_app_ui) diff --git a/swift/llm/utils/__init__.py b/swift/llm/utils/__init__.py index 2726797467..5470565ad2 100644 --- a/swift/llm/utils/__init__.py +++ b/swift/llm/utils/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from .argument import (DeployArguments, DPOArguments, InferArguments, - RomeArguments, SftArguments, is_lora) +from .argument import (AppUIArguments, DeployArguments, DPOArguments, + InferArguments, RomeArguments, SftArguments, is_lora) from .client_utils import get_model_list_client, inference_client from .dataset import (DATASET_MAPPING, DatasetName, GetDatasetFunction, HfDataset, add_self_cognition_dataset, get_dataset, diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 9fcb521c4f..896b94b06e 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -165,6 +165,9 @@ class SftArguments: top_p: float = 0.7 repetition_penalty: float = 1.05 num_beams: int = 1 + # compatibility hf + per_device_train_batch_size: Optional[int] = None + per_device_eval_batch_size: Optional[int] = None # compatibility. (Deprecated) only_save_model: Optional[bool] = None @@ -307,8 +310,9 @@ def __post_init__(self) -> None: 'support_gradient_checkpointing', True) if self.gradient_checkpointing is None: self.gradient_checkpointing = support_gradient_checkpointing - elif not support_gradient_checkpointing: - assert self.gradient_checkpointing is False, f'{self.model_type} not support gradient_checkpointing.' 
+ elif not support_gradient_checkpointing and self.gradient_checkpointing is True: + logger.warning( + f'{self.model_type} not support gradient_checkpointing.') @dataclass @@ -374,8 +378,6 @@ class InferArguments: save_safetensors: bool = True overwrite_generation_config: Optional[bool] = None verbose: Optional[bool] = None - # app-ui - share: bool = False # vllm gpu_memory_utilization: float = 0.9 tensor_parallel_size: int = 1 @@ -473,6 +475,13 @@ def check_ckpt_dir_correct(ckpt_dir) -> bool: return os.path.isfile(os.path.join(ckpt_dir, 'configuration.json')) +@dataclass +class AppUIArguments(InferArguments): + server_name: str = '127.0.0.1' + server_port: int = 7860 + share: bool = False + + @dataclass class DeployArguments(InferArguments): host: str = '127.0.0.1' @@ -603,17 +612,21 @@ def handle_compatibility(args: Union[SftArguments, InferArguments]) -> None: args.template_type = 'chatglm-generation' if args.template_type == 'chatml': args.template_type = TemplateType.qwen - if (isinstance(args, InferArguments) and args.show_dataset_sample != 10 - and args.val_dataset_sample == 10): - # args.val_dataset_sample is the default value and args.show_dataset_sample is not the default value. - args.val_dataset_sample = args.show_dataset_sample if args.truncation_strategy == 'ignore': args.truncation_strategy = 'delete' - if isinstance(args, - InferArguments) and args.safe_serialization is not None: - args.save_safetensors = args.safe_serialization - if isinstance(args, SftArguments) and args.only_save_model is not None: - args.save_only_model = args.only_save_model + if isinstance(args, InferArguments): + if args.show_dataset_sample != 10 and args.val_dataset_sample == 10: + # args.val_dataset_sample is the default value and args.show_dataset_sample is not the default value. 
+ args.val_dataset_sample = args.show_dataset_sample + if args.safe_serialization is not None: + args.save_safetensors = args.safe_serialization + if isinstance(args, SftArguments): + if args.only_save_model is not None: + args.save_only_model = args.only_save_model + if args.per_device_train_batch_size is not None: + args.batch_size = args.per_device_train_batch_size + if args.per_device_eval_batch_size is not None: + args.eval_batch_size = args.per_device_eval_batch_size def set_model_type(args: Union[SftArguments, InferArguments]) -> None: diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index ac93b279cb..3c8769f209 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -581,7 +581,7 @@ def reorganize_row_simple(sample) -> Dict[str, str]: [('harmless-base', 'test')], process_hh_rlhf, get_dataset_from_repo, - tags=['hfrl', 'dpo', 'pairwise']) + tags=['hfrl', 'dpo', 'pairwise', '🔥']) register_dataset( DatasetName.medical_zh, @@ -668,7 +668,7 @@ def add_system(row): [('default', 'validation')], _preprocess_capcha_images, get_dataset_from_repo, - tags=['chat', 'multi-modal', 'vision', '🔥']) + tags=['chat', 'multi-modal', 'vision']) register_dataset( DatasetName.cls_fudan_news_zh, diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 2feba9832f..5e8b81d8d9 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -81,6 +81,8 @@ class ModelType: # deepseek deepseek_7b = 'deepseek-7b' deepseek_7b_chat = 'deepseek-7b-chat' + deepseek_moe_16b = 'deepseek-moe-16b' + deepseek_moe_16b_chat = 'deepseek-moe-16b-chat' deepseek_67b = 'deepseek-67b' deepseek_67b_chat = 'deepseek-67b-chat' # openbuddy @@ -92,10 +94,10 @@ class ModelType: openbuddy_deepseek_67b_chat = 'openbuddy-deepseek-67b-chat' # mistral mistral_7b = 'mistral-7b' - mistral_7b_chat = 'mistral-7b-chat' - mistral_7b_chat_v2 = 'mistral-7b-chat-v2' - mixtral_7b_moe = 'mixtral-7b-moe' - mixtral_7b_moe_chat = 'mixtral-7b-moe-chat' + 
mistral_7b_instruct = 'mistral-7b-instruct' + mistral_7b_instruct_v2 = 'mistral-7b-instruct-v2' + mixtral_moe_7b = 'mixtral-moe-7b' + mixtral_moe_7b_instruct = 'mixtral-moe-7b-instruct' # baichuan baichuan_7b = 'baichuan-7b' baichuan_13b = 'baichuan-13b' @@ -147,11 +149,11 @@ class ModelType: codefuse_codellama_34b_chat = 'codefuse-codellama-34b-chat' # deepseek-coder deepseek_coder_1_3b = 'deepseek-coder-1_3b' - deepseek_coder_1_3b_chat = 'deepseek-coder-1_3b-chat' + deepseek_coder_1_3b_instruct = 'deepseek-coder-1_3b-instruct' deepseek_coder_6_7b = 'deepseek-coder-6_7b' - deepseek_coder_6_7b_chat = 'deepseek-coder-6_7b-chat' + deepseek_coder_6_7b_instruct = 'deepseek-coder-6_7b-instruct' deepseek_coder_33b = 'deepseek-coder-33b' - deepseek_coder_33b_chat = 'deepseek-coder-33b-chat' + deepseek_coder_33b_instruct = 'deepseek-coder-33b-instruct' # phi phi2_3b = 'phi2-3b' @@ -631,7 +633,7 @@ def cross_entropy_forward(self, inputs: Tensor, support_flash_attn=True, support_vllm=True) @register_model( - ModelType.deepseek_coder_1_3b_chat, + ModelType.deepseek_coder_1_3b_instruct, 'deepseek-ai/deepseek-coder-1.3b-instruct', LoRATM.llama2, TemplateType.deepseek_coder, @@ -639,7 +641,7 @@ def cross_entropy_forward(self, inputs: Tensor, support_flash_attn=True, support_vllm=True) @register_model( - ModelType.deepseek_coder_6_7b_chat, + ModelType.deepseek_coder_6_7b_instruct, 'deepseek-ai/deepseek-coder-6.7b-instruct', LoRATM.llama2, TemplateType.deepseek_coder, @@ -647,7 +649,7 @@ def cross_entropy_forward(self, inputs: Tensor, support_flash_attn=True, support_vllm=True) @register_model( - ModelType.deepseek_coder_33b_chat, + ModelType.deepseek_coder_33b_instruct, 'deepseek-ai/deepseek-coder-33b-instruct', LoRATM.llama2, TemplateType.deepseek_coder, @@ -800,7 +802,7 @@ def cross_entropy_forward(self, inputs: Tensor, support_flash_attn=True, support_vllm=True) @register_model( - ModelType.mistral_7b_chat, + ModelType.mistral_7b_instruct, 
'AI-ModelScope/Mistral-7B-Instruct-v0.1', LoRATM.llama2, TemplateType.llama, @@ -808,7 +810,7 @@ def cross_entropy_forward(self, inputs: Tensor, support_flash_attn=True, support_vllm=True) @register_model( - ModelType.mistral_7b_chat_v2, + ModelType.mistral_7b_instruct_v2, 'AI-ModelScope/Mistral-7B-Instruct-v0.2', LoRATM.llama2, TemplateType.llama, @@ -824,7 +826,7 @@ def cross_entropy_forward(self, inputs: Tensor, support_flash_attn=True, support_vllm=True) @register_model( - ModelType.mixtral_7b_moe, + ModelType.mixtral_moe_7b, 'AI-ModelScope/Mixtral-8x7B-v0.1', LoRATM.llama2, TemplateType.default_generation_bos, @@ -833,7 +835,7 @@ def cross_entropy_forward(self, inputs: Tensor, support_vllm=True, support_gradient_checkpointing=False) @register_model( - ModelType.mixtral_7b_moe_chat, + ModelType.mixtral_moe_7b_instruct, 'AI-ModelScope/Mixtral-8x7B-Instruct-v0.1', LoRATM.llama2, TemplateType.llama, @@ -1371,6 +1373,44 @@ def get_model_tokenizer_phi(model_dir: str, load_model, model_config, **kwargs) +@register_model( + ModelType.deepseek_moe_16b_chat, + 'deepseek-ai/deepseek-moe-16b-chat', + LoRATM.llama2, + TemplateType.deepseek, + support_flash_attn=True) +@register_model( + ModelType.deepseek_moe_16b, + 'deepseek-ai/deepseek-moe-16b-base', + LoRATM.llama2, + TemplateType.default_generation_bos, + support_flash_attn=True) +def get_model_tokenizer_deepseek_moe(model_dir: str, + torch_dtype: Dtype, + model_kwargs: Dict[str, Any], + load_model: bool = True, + **kwargs): + model, tokenizer = get_model_tokenizer_with_flash_attn( + model_dir, torch_dtype, model_kwargs, load_model, **kwargs) + if model is not None: + # fix dtype bug + mlp_cls = model.model.layers[1].mlp.__class__ + if not hasattr(mlp_cls, '__old_forward'): # Avoid double patching + __old_forward = mlp_cls._old_forward if hasattr( + mlp_cls, '_old_forward') else mlp_cls.forward + + def _new_forward(self, hidden_states) -> Tensor: + dtype = hidden_states.dtype + return __old_forward(self, 
hidden_states).to(dtype) + + if hasattr(mlp_cls, '_old_forward'): # device_map + mlp_cls._old_forward = _new_forward + else: + mlp_cls.forward = _new_forward + mlp_cls.__old_forward = __old_forward + return model, tokenizer + + def fix_transformers_upgrade(module: PreTrainedModel) -> None: # from 4.35, transformers changes its arguments of _set_gradient_checkpointing if version.parse(transformers.__version__) >= version.parse('4.35'): diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index c4dfeac5c4..14e5914442 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -32,8 +32,8 @@ class TemplateType: zephyr = 'zephyr' sus = 'sus' deepseek = 'deepseek' - codefuse_codellama = 'codefuse-codellama' deepseek_coder = 'deepseek-coder' + codefuse_codellama = 'codefuse-codellama' cogagent = 'cogagent' # compatibility. (Deprecated) chatml = 'chatml'