diff --git a/README.md b/README.md
index e05aec137d..7871ca3c9b 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,8 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用
 
 
 ## 🎉 News
-- 2023.1.4: Support for **VLLM deployment**, compatible with the **OpenAI API** style. For more details, please refer to [VLLM Inference Acceleration and Deployment](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#部署)
+- 🔥2023.1.12: Support **deepseek-moe** series: deepseek-moe-16b, [deepseek-moe-16b-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/deepseek_moe_16b_chat).
+- 🔥2023.1.4: Support for **VLLM deployment**, compatible with the **OpenAI API** style. For more details, please refer to [VLLM Inference Acceleration and Deployment](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#部署)
 - 2023.1.4: Update [Benchmark](https://github.com/modelscope/swift/blob/main/docs/source/LLM/Benchmark.md) to facilitate viewing the training speed and GPU memory required for different models.
 - 🔥 2023.12.29: Support web-ui for training and inference, use `swift web-ui` after the installation of ms-swift.
 - 🔥 2023.12.29: Support DPO RLHF(Reinforcement Learning from Human Feedback) and two datasets: AI-ModelScope/stack-exchange-paired and AI-ModelScope/hh-rlhf for this task. Check [this documentation](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM%E4%BA%BA%E7%B1%BB%E5%AF%B9%E9%BD%90%E8%AE%AD%E7%BB%83%E6%96%87%E6%A1%A3.md) to start training!
@@ -70,8 +71,8 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用
 - 2023.12.23: Support [codegeex2-6b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/codegeex2_6b).
 - 2023.12.19: Support [phi2-3b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/phi2_3b).
 - 2023.12.18: Support for VLLM for inference acceleration.
-- 2023.12.15: Support deepseek, deepseek-coder series: deepseek-7b, deepseek-7b-chat, deepseek-67b, deepseek-67b-chat, openbuddy-deepseek-67b-chat, deepseek-coder-1_3b, deepseek-coder-1_3b-chat, deepseek-coder-6_7b, deepseek-coder-6_7b-chat, deepseek-coder-33b, deepseek-coder-33b-chat.
-- 2023.12.13: Support mistral-7b-chat-v2, [mixtral-7b-moe](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe), [mixtral-7b-moe-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe_chat).
+- 2023.12.15: Support deepseek, deepseek-coder series: deepseek-7b, deepseek-7b-chat, deepseek-67b, deepseek-67b-chat, openbuddy-deepseek-67b-chat, deepseek-coder-1_3b, deepseek-coder-1_3b-instruct, deepseek-coder-6_7b, deepseek-coder-6_7b-instruct, deepseek-coder-33b, deepseek-coder-33b-instruct.
+- 2023.12.13: Support mistral-7b-instruct-v2, [mixtral-moe-7b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_moe_7b), [mixtral-moe-7b-instruct](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_moe_7b_instruct).
 - 2023.12.9: Support the `freeze_parameters` parameter as a compromise between LoRA and full parameter. Corresponding shell scripts can be found at [full_freeze_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp). Support `disable_tqdm`, `lazy_tokenize`, `preprocess_num_proc` parameters, for details please refer to [Command-Line parameters](https://github.com/modelscope/swift/blob/main/docs/source/LLM/命令行参数.md).
 - 2023.12.8: Support [sus-34b-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/sus_34b_chat), support yi-6b-200k, yi-34b-200k.
 - 2023.12.7: Support [Multi-Node DDP training](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM%E5%BE%AE%E8%B0%83%E6%96%87%E6%A1%A3.md#%E4%BD%BF%E7%94%A8cli).
@@ -96,7 +97,7 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用
 - 🔥 2023.10.27: Support for **chatglm3** series models: chatglm3-6b-base, chatglm3-6b, chatglm3-6b-32k. The corresponding shell script can be found in [chatglm3_6b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/chatglm3_6b).
 - 🔥 2023.10.17: Supported **int4**, **int8** models: qwen-7b-chat-int4, qwen-14b-chat-int4, qwen-vl-chat-int4, baichuan2-7b-chat-int4, baichuan2-13b-chat-int4, qwen-7b-chat-int8, qwen-14b-chat-int8.
 - 2023.10.15: Supported **ziya2-13b** model series: ziya2-13b, ziya2-13b-chat.
-- 2023.10.12: Supported **mistral-7b** model series: openbuddy-mistral-7b-chat, mistral-7b, mistral-7b-chat.
+- 2023.10.12: Supported **mistral-7b** model series: openbuddy-mistral-7b-chat, mistral-7b, mistral-7b-instruct.
 - 🔥 2023.10.7: Supported **DeepSpeed ZeRO-2**, enabling LoRA (not just QLoRA) to run DDP on 2*A10.
 - 2023.10.4: Supported datasets in the fields of mathematics, law, SQL, and coding: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en.
 - 🔥 2023.9.25: Supported **qwen-14b** model series: qwen-14b, qwen-14b-chat.
@@ -128,15 +129,15 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用
   - Multi-Modal:
     - qwen-vl series: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary), [qwen-vl-chat-int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)
     - qwen-audio series: [qwen-audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary), [qwen-audio-chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)
-    - Zhipu series: [cogagent-chat](https://www.modelscope.cn/models/ZhipuAI/cogagent-chat/summary), [cogagent-vqa](https://www.modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)
+    - cogagent series: [cogagent-chat](https://www.modelscope.cn/models/ZhipuAI/cogagent-chat/summary), [cogagent-vqa](https://www.modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)
   - General:
     - qwen series: [qwen-1_8b-chat](https://modelscope.cn/models/qwen/Qwen-1_8B/summary), [qwen-1_8b-chat-int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary), [qwen-1_8b-chat-int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary), [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-7b-chat-int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary), [qwen-7b-chat-int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary), [qwen-14b-chat-int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary), [qwen-14b-chat-int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary), [qwen-72b](https://modelscope.cn/models/qwen/Qwen-72B/summary), [qwen-72b-chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary), [qwen-72b-chat-int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary), [qwen-72b-chat-int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary)
     - chatglm series: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary), [chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary), [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary), [chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)
     - llama series: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
     - yi series: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-6b-200k](https://modelscope.cn/models/01ai/Yi-6B-200K/summary), [yi-6b-chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary), [yi-34b-200k](https://modelscope.cn/models/01ai/Yi-34B-200K/summary), [yi-34b-chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)
-    - deepseek series: [deepseek-7b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary), [deepseek-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary), [deepseek-67b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary), [deepseek-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)
+    - deepseek series: [deepseek-7b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary), [deepseek-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary), [deepseek-67b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary), [deepseek-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary), [deepseek-moe-16b](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary), [deepseek-moe-16b-chat|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary)
     - openbuddy series: [openbuddy-llama2-13b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary), [openbuddy-mistral-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary), [openbuddy-zephyr-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary), [openbuddy-deepseek-67b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)
-    - mistral series: [mistral-7b](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary), [mistral-7b-chat](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary), [mistral-7b-chat-v2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary), [mixtral-7b-moe](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary), [mixtral-7b-moe-chat](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)
+    - mistral series: [mistral-7b](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary), [mistral-7b-instruct](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary), [mistral-7b-instruct-v2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary), [mixtral-moe-7b](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary), [mixtral-moe-7b-instruct](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)
     - baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)
     - internlm series: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
     - xverse series: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary), [xverse-65b](https://modelscope.cn/models/xverse/XVERSE-65B/summary)
@@ -150,13 +151,13 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用
     - tongyi-finance series: [tongyi-finance-14b](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary), [tongyi-finance-14b-chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary), [tongyi-finance-14b-chat-int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)
   - Coding:
     - codefuse series: [codefuse-codellama-34b-chat](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)
-    - deepseek-coder series: [deepseek-coder-1_3b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary), [deepseek-coder-1_3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary), [deepseek-coder-6_7b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary), [deepseek-coder-6_7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary), [deepseek-coder-33b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary), [deepseek-coder-33b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)
+    - deepseek-coder series: [deepseek-coder-1_3b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary), [deepseek-coder-1_3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary), [deepseek-coder-6_7b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary), [deepseek-coder-6_7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary), [deepseek-coder-33b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary), [deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)
     - phi series: [phi2-3b](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)
 - Supported Datasets: [[Detail]](https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md#%E6%95%B0%E6%8D%AE%E9%9B%86)
   - NLP:
     - General: 🔥[alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), 🔥[alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), [multi-alpaca-all](https://www.modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary), [instinwild-en](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [instinwild-zh](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [cot-en](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [cot-zh](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [firefly-all-zh](https://www.modelscope.cn/datasets/wyj123456/firefly/summary), [instruct-en](https://www.modelscope.cn/datasets/wyj123456/instruct/summary), [gpt4all-en](https://www.modelscope.cn/datasets/wyj123456/GPT4all/summary), [sharegpt-en](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [sharegpt-zh](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [tutu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture/summary), [wikipedia-zh](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered/summary), [open-orca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary), [open-orca-gpt4](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary), [sharegpt-gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)
     - Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), 🔥[damo-agent-mini-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), 🔥[agent-instruct-all-en](https://modelscope.cn/datasets/ZhipuAI/AgentInstruct/summary)
-    - RLHF: [hh-rlhf](https://www.modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary), [stack-exchange-paired](https://www.modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)
+    - RLHF: 🔥[hh-rlhf](https://www.modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary), [stack-exchange-paired](https://www.modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)
     - Coding: [code-alpaca-en](https://www.modelscope.cn/datasets/wyj123456/code_alpaca_en/summary), 🔥[leetcode-python-en](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary), 🔥[codefuse-python-en](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), 🔥[codefuse-evol-instruction-zh](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k/summary)
     - Medical: [medical-en](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-mini-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary)
     - Law: 🔥[lawyer-llama-zh](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary), [tigerbot-law-zh](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)
diff --git a/README_CN.md b/README_CN.md
index 1c54b84fc1..e292fcb51e 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -60,7 +60,8 @@ SWIFT（Scalable lightWeight Infrastructure for Fine-Tuning）是一个可扩展
 用户可以查看 [SWIFT官方文档](docs/source/GetStarted/快速使用.md) 来了解详细信息。
 
 ## 🎉 新闻
-- 2023.1.4: 支持**VLLM部署**, 兼容**OpenAI API**样式, 具体可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#部署).
+- 🔥2023.1.12: 支持**deepseek-moe**系列: deepseek-moe-16b, [deepseek-moe-16b-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/deepseek_moe_16b_chat).
+- 🔥2023.1.4: 支持**VLLM部署**, 兼容**OpenAI API**样式, 具体可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#部署).
 - 2023.1.4: 更新[Benchmark](https://github.com/modelscope/swift/blob/main/docs/source/LLM/Benchmark.md), 方便查看不同模型训练的速度和所需显存.
 - 🔥 2023.12.29: 支持web-ui进行sft训练和推理，安装ms-swift后使用`swift web-ui`开启
 - 🔥 2023.12.29: 支持 DPO RLHF(Reinforcement Learning from Human Feedback) 和两个用于此任务的数据集: AI-ModelScope/stack-exchange-paired 以及 AI-ModelScope/hh-rlhf. 查看[文档](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM%E4%BA%BA%E7%B1%BB%E5%AF%B9%E9%BD%90%E8%AE%AD%E7%BB%83%E6%96%87%E6%A1%A3.md)开启训练！
@@ -68,8 +69,8 @@ SWIFT（Scalable lightWeight Infrastructure for Fine-Tuning）是一个可扩展
 - 2023.12.23: 支持[codegeex2-6b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/codegeex2_6b).
 - 2023.12.19: 支持[phi2-3b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/phi2_3b).
 - 2023.12.18: 支持VLLM进行推理加速.
-- 2023.12.15: 支持deepseek, deepseek-coder系列: deepseek-7b, deepseek-7b-chat, deepseek-67b, deepseek-67b-chat, openbuddy-deepseek-67b-chat, deepseek-coder-1_3b, deepseek-coder-1_3b-chat, deepseek-coder-6_7b, deepseek-coder-6_7b-chat, deepseek-coder-33b, deepseek-coder-33b-chat.
-- 2023.12.13: 支持mistral-7b-chat-v2, [mixtral-7b-moe](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe), [mixtral-7b-moe-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe_chat).
+- 2023.12.15: 支持deepseek, deepseek-coder系列: deepseek-7b, deepseek-7b-chat, deepseek-67b, deepseek-67b-chat, openbuddy-deepseek-67b-chat, deepseek-coder-1_3b, deepseek-coder-1_3b-instruct, deepseek-coder-6_7b, deepseek-coder-6_7b-instruct, deepseek-coder-33b, deepseek-coder-33b-instruct.
+- 2023.12.13: 支持mistral-7b-instruct-v2, [mixtral-moe-7b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe), [mixtral-moe-7b-instruct](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe_instruct).
 - 2023.12.9: 支持`freeze_parameters`参数, 作为lora和全参数训练的折中方案. 对应的sh可以查看[full_freeze_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp). 支持`disable_tqdm`, `lazy_tokenize`, `preprocess_num_proc`参数, 具体可以查看[命令行参数](https://github.com/modelscope/swift/blob/main/docs/source/LLM/命令行参数.md).
 - 2023.12.8: 支持[sus-34b-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/sus_34b_chat), 支持yi-6b-200k, yi-34b-200k.
 - 2023.12.7: 支持[Multi-Node DDP训练](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM%E5%BE%AE%E8%B0%83%E6%96%87%E6%A1%A3.md#%E4%BD%BF%E7%94%A8cli).
@@ -94,7 +95,7 @@ SWIFT（Scalable lightWeight Infrastructure for Fine-Tuning）是一个可扩展
 - 🔥 2023.10.27: 支持**chatglm3**系列模型: chatglm3-6b-base, chatglm3-6b, chatglm3-6b-32k. 对应的sh脚本可以查看[chatglm3_6b](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/chatglm3_6b).
 - 🔥 2023.10.17: 支持**int4**, **int8**模型的SFT: qwen-7b-chat-int4, qwen-14b-chat-int4, qwen-vl-chat-int4, baichuan2-7b-chat-int4, baichuan2-13b-chat-int4, qwen-7b-chat-int8, qwen-14b-chat-int8.
 - 2023.10.15: 支持**ziya2-13b**系列模型: ziya2-13b, ziya2-13b-chat.
-- 2023.10.12: 支持**mistral-7b**系列模型: openbuddy-mistral-7b-chat, mistral-7b, mistral-7b-chat.
+- 2023.10.12: 支持**mistral-7b**系列模型: openbuddy-mistral-7b-chat, mistral-7b, mistral-7b-instruct.
 - 🔥 2023.10.7: 支持**DeepSpeed ZeRO-2**, 使得lora(不仅仅是qlora)可以在双卡A10上运行DDP.
 - 2023.10.4: 支持更多数学, 法律, SQL, 代码领域的数据集: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en.
 - 🔥 2023.9.25: 支持**qwen-14b**系列: qwen-14b, qwen-14b-chat.
@@ -126,15 +127,15 @@ SWIFT（Scalable lightWeight Infrastructure for Fine-Tuning）是一个可扩展
   - 多模态:
     - qwen-vl 系列: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary), [qwen-vl-chat-int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)
     - qwen-audio 系列: [qwen-audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary), [qwen-audio-chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)
-    - Zhipu多模态模型: [cogagent-chat](https://www.modelscope.cn/models/ZhipuAI/cogagent-chat/summary), [cogagent-vqa](https://www.modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)
+    - cogagent 系列: [cogagent-chat](https://www.modelscope.cn/models/ZhipuAI/cogagent-chat/summary), [cogagent-vqa](https://www.modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)
   - 通用:
     - qwen 系列: [qwen-1_8b-chat](https://modelscope.cn/models/qwen/Qwen-1_8B/summary), [qwen-1_8b-chat-int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary), [qwen-1_8b-chat-int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary), [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-7b-chat-int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary), [qwen-7b-chat-int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary), [qwen-14b-chat-int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary), [qwen-14b-chat-int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary), [qwen-72b](https://modelscope.cn/models/qwen/Qwen-72B/summary), [qwen-72b-chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary), [qwen-72b-chat-int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary), [qwen-72b-chat-int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary)
     - chatglm 系列: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary), [chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary), [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary), [chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)
     - llama 系列: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
     - yi 系列: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-6b-200k](https://modelscope.cn/models/01ai/Yi-6B-200K/summary), [yi-6b-chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary), [yi-34b-200k](https://modelscope.cn/models/01ai/Yi-34B-200K/summary), [yi-34b-chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)
-    - deepseek 系列: [deepseek-7b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary), [deepseek-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary), [deepseek-67b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary), [deepseek-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)
+    - deepseek 系列: [deepseek-7b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary), [deepseek-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary), [deepseek-67b](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary), [deepseek-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary), [deepseek-moe-16b](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary), [deepseek-moe-16b-chat|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary)
     - openbuddy 系列: [openbuddy-llama2-13b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary), [openbuddy-mistral-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary), [openbuddy-zephyr-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary), [openbuddy-deepseek-67b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)
-    - mistral 系列: [mistral-7b](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary), [mistral-7b-chat](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary), [mistral-7b-chat-v2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary), [mixtral-7b-moe](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary), [mixtral-7b-moe-chat](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)
+    - mistral 系列: [mistral-7b](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary), [mistral-7b-instruct](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary), [mistral-7b-instruct-v2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary), [mixtral-moe-7b](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary), [mixtral-moe-7b-instruct](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)
     - baichuan 系列: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)
     - internlm 系列: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
     - xverse 系列: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary), [xverse-65b](https://modelscope.cn/models/xverse/XVERSE-65B/summary)
@@ -148,13 +149,13 @@ SWIFT（Scalable lightWeight Infrastructure for Fine-Tuning）是一个可扩展
     - tongyi-finance 系列: [tongyi-finance-14b](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary), [tongyi-finance-14b-chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary), [tongyi-finance-14b-chat-int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)
   - 代码:
     - codefuse 系列: [codefuse-codellama-34b-chat](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)
-    - deepseek-coder 系列: [deepseek-coder-1_3b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary), [deepseek-coder-1_3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary), [deepseek-coder-6_7b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary), [deepseek-coder-6_7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary), [deepseek-coder-33b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary), [deepseek-coder-33b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)
+    - deepseek-coder 系列: [deepseek-coder-1_3b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary), [deepseek-coder-1_3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary), [deepseek-coder-6_7b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary), [deepseek-coder-6_7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary), [deepseek-coder-33b](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary), [deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)
     - phi 系列: [phi2-3b](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)
 - 支持的数据集: [[详细]](https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md#%E6%95%B0%E6%8D%AE%E9%9B%86)
   - NLP:
     - 通用: 🔥[alpaca-en](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-en/summary)(gpt4), 🔥[alpaca-zh](https://modelscope.cn/datasets/AI-ModelScope/alpaca-gpt4-data-zh/summary)(gpt4), [multi-alpaca-all](https://www.modelscope.cn/datasets/damo/nlp_polylm_multialpaca_sft/summary), [instinwild-en](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [instinwild-zh](https://www.modelscope.cn/datasets/wyj123456/instinwild/summary), [cot-en](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [cot-zh](https://www.modelscope.cn/datasets/YorickHe/CoT/summary), [firefly-all-zh](https://www.modelscope.cn/datasets/wyj123456/firefly/summary), [instruct-en](https://www.modelscope.cn/datasets/wyj123456/instruct/summary), [gpt4all-en](https://www.modelscope.cn/datasets/wyj123456/GPT4all/summary), [sharegpt-en](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [sharegpt-zh](https://www.modelscope.cn/datasets/huangjintao/sharegpt/summary), [tutu-v2-sft-mixture](https://modelscope.cn/datasets/AI-ModelScope/tulu-v2-sft-mixture/summary), [wikipedia-zh](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered/summary), [open-orca](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary), [open-orca-gpt4](https://modelscope.cn/datasets/AI-ModelScope/OpenOrca/summary), [sharegpt-gpt4](https://modelscope.cn/datasets/AI-ModelScope/sharegpt_gpt4/summary)
     - Agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), 🔥[damo-agent-mini-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), 🔥[agent-instruct-all-en](https://modelscope.cn/datasets/ZhipuAI/AgentInstruct/summary)
-    - RLHF: [hh-rlhf](https://www.modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary), [stack-exchange-paired](https://www.modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)
+    - RLHF: 🔥[hh-rlhf](https://www.modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary), [stack-exchange-paired](https://www.modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)
     - 代码: [code-alpaca-en](https://www.modelscope.cn/datasets/wyj123456/code_alpaca_en/summary), 🔥[leetcode-python-en](https://modelscope.cn/datasets/AI-ModelScope/leetcode-solutions-python/summary), 🔥[codefuse-python-en](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k/summary), 🔥[codefuse-evol-instruction-zh](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k/summary)
     - 医疗: [medical-en](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary), [medical-mini-zh](https://www.modelscope.cn/datasets/huangjintao/medical_zh/summary)
     - 法律: 🔥[lawyer-llama-zh](https://modelscope.cn/datasets/AI-ModelScope/lawyer_llama_data/summary), [tigerbot-law-zh](https://modelscope.cn/datasets/AI-ModelScope/tigerbot-law-plugin/summary)
diff --git "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"
index e5c212169b..7d36876773 100644
--- "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"
+++ "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md"
@@ -413,10 +413,10 @@ CUDA_VISIBLE_DEVICES=0 swift app-ui --model_type qwen-7b-chat
 import os
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
-from swift.llm import InferArguments, ModelType, app_ui_main
+from swift.llm import AppUIArguments, ModelType, app_ui_main
 
-infer_args = InferArguments(model_type=ModelType.qwen_7b_chat)
-app_ui_main(infer_args)
+app_ui_args = AppUIArguments(model_type=ModelType.qwen_7b_chat)
+app_ui_main(app_ui_args)
 ```
 
 使用bnb量化:
@@ -424,10 +424,10 @@ app_ui_main(infer_args)
 import os
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
-from swift.llm import InferArguments, ModelType, app_ui_main
+from swift.llm import AppUIArguments, ModelType, app_ui_main
 
-infer_args = InferArguments(model_type=ModelType.qwen_7b_chat, quantization_bit=4)
-app_ui_main(infer_args)
+app_ui_args = AppUIArguments(model_type=ModelType.qwen_7b_chat, quantization_bit=4)
+app_ui_main(app_ui_args)
 ```
 
 ### qwen-7b
@@ -441,10 +441,10 @@ CUDA_VISIBLE_DEVICES=0 swift app-ui --model_type qwen-7b
 import os
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
-from swift.llm import InferArguments, ModelType, app_ui_main
+from swift.llm import AppUIArguments, ModelType, app_ui_main
 
-infer_args = InferArguments(model_type=ModelType.qwen_7b)
-app_ui_main(infer_args)
+app_ui_args = AppUIArguments(model_type=ModelType.qwen_7b)
+app_ui_main(app_ui_args)
 ```
 
 ### 微调后模型
diff --git "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 052fd250d8..33a076c5df 100644
--- "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -1,10 +1,12 @@
 # 命令行参数
 ## 目录
-- [sft 命令行参数](#sft-命令行参数)
-- [merge-lora infer app-ui 命令行参数](#merge-lora-infer-app-ui-命令行参数)
-- [deploy 命令行参数](#deploy-命令行参数)
+- [SFT 参数](#SFT-参数)
+- [DPO 参数](#DPO-参数)
+- [merge-lora infer 参数](#merge-lora-infer-参数)
+- [app-ui 参数](#app-ui-参数)
+- [deploy 参数](#deploy-参数)
 
-## sft 命令行参数
+## SFT 参数
 - `--model_type`: 表示你选择的模型类型, 默认是`None`. 如果没有指定`model_id_or_path`, 则抛出异常. 如果指定了`model_id_or_path`, 则会根据`model_id_or_path`以及`MODEL_MAPPING`推断`model_type`. `model_type`和`model_id_or_path`这两个参数不能同时指定. 可以选择的`model_type`可以查看`MODEL_MAPPING.keys()`.
 - `--model_id_or_path`: 表示模型在ModelScope Hub中的`model_id`, 不区分大小写, 默认为`None`. 如果`--model_id_or_path`未被注册, 则会抛出异常. 你可以使用`model_type`的方式指定模型类型, 也可以通过`model_id_or_path`的方式指定模型类型.
 - `--model_revision`: 表示模型在ModelScope Hub中对应`model_id`的版本号, 默认为`None`. `model_revision`指定为`None`, 则使用注册在`MODEL_MAPPING`中的revision. 否则强制使用命令行传入的`model_revision`.
@@ -92,15 +94,15 @@
 - `--repetition_penalty`: 默认为`1.05`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
 - `--num_beams`: 默认为`1`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
 
-## DPO参数
+## DPO 参数
 
-DPO参数继承了上面的SFT参数, 除此之外增加了以下参数:
+dpo参数继承了sft参数, 除此之外增加了以下参数:
 
-- `--ref_model_type` 对比模型类型, 可以选择的`model_type`可以查看`MODEL_MAPPING.keys()`.
-- `--max_prompt_length` 最大的提示长度, 该参数会传入DPOTrainer中, 使prompt长度不超过该值的设置, 默认值1024.
+- `--ref_model_type` 对比模型的类型, 可以选择的`model_type`可以查看`MODEL_MAPPING.keys()`.
+- `--max_prompt_length` 最大的提示长度, 该参数会传入DPOTrainer中, 使prompt长度不超过该值的设置, 默认值`1024`.
 
 
-## merge-lora infer app-ui 命令行参数
+## merge-lora infer 参数
 - `--model_type`: 默认值为`None`, 具体的参数介绍可以在`sft.sh命令行参数`中查看.
 - `--model_id_or_path`: 默认值为`None`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. 推荐使用model_type的方式指定.
 - `--model_revision`: 默认值为`None`. 具体的参数介绍可以在`sft.sh命令行参数`中查看. 如果`model_id_or_path`为None或者是本地的模型目录, 则该参数失效.
@@ -142,14 +144,23 @@ DPO参数继承了上面的SFT参数, 除此之外增加了以下参数:
 - `--save_safetensors`: 保存成`safetensors`文件还是`bin`文件. 默认为`True`.
 - `--overwrite_generation_config`: 是否将评估所使用的generation_config保存成`generation_config.json`文件, 默认为`None`. 如果指定了`ckpt_dir`, 则设置为`True`, 否则设置为`False`. 训练时保存的generation_config文件将被覆盖.
 - `--verbose`: 如果设置为False, 则使用tqdm样式推理. 如果设置为True, 则输出推理的query, response, label. 默认为`None`, 进行自动选择, 即`len(val_dataset) >= 100`时, 设置为False, 否则设置为True. 该参数只有在使用数据集评估时生效.
-- `--share`: 传递给gradio的`demo.queue().launch(...)`函数. 该参数只有在使用`app-ui`时才生效.
 - `--gpu_memory_utilization`: 初始化vllm引擎`EngineArgs`的参数, 默认为`0.9`. 该参数只有在使用vllm时才生效.
 - `--tensor_parallel_size`: 初始化vllm引擎`EngineArgs`的参数, 默认为`1`. 该参数只有在使用vllm时才生效.
 
 
-## deploy 命令行参数
+## app-ui 参数
+
+app-ui参数继承了infer参数, 除此之外增加了以下参数:
+
+- `server_name`: 默认为`'127.0.0.1'`. 传递给gradio的`demo.queue().launch(...)`函数.
+- `server_port`: 默认为`7860`. 传递给gradio的`demo.queue().launch(...)`函数.
+- `share`: 默认为`False`. 传递给gradio的`demo.queue().launch(...)`函数.
+
+## deploy 参数
+
+deploy参数继承了infer参数, 除此之外增加了以下参数:
+
 - `--host`: 默认为`'127.0.0.1`.
 - `--port`: 默认为`8000`.
 - `--ssl_keyfile`: 默认为`None`.
 - `--ssl_certfile`: 默认为`None`.
-- 其他参数继承自infer的命令行参数.
diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index 51e0b33168..090af658de 100644
--- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -55,6 +55,8 @@
 |yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|&#x2714;|&#x2714;||
 |deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||
 |deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||
+|deepseek-moe-16b|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;||
+|deepseek-moe-16b-chat|[deepseek-ai/deepseek-moe-16b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2718;||
 |deepseek-67b|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||
 |deepseek-67b-chat|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)|q_proj, k_proj, v_proj|deepseek|&#x2714;|&#x2714;||
 |openbuddy-llama2-13b-chat|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||
@@ -64,10 +66,10 @@
 |openbuddy-zephyr-7b-chat|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;|transformers>=4.34|
 |openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|&#x2714;|&#x2714;||
 |mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.34|
-|mistral-7b-chat|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.34|
-|mistral-7b-chat-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.34|
-|mixtral-7b-moe|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.36|
-|mixtral-7b-moe-chat|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.36|
+|mistral-7b-instruct|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.34|
+|mistral-7b-instruct-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.34|
+|mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;|transformers>=4.36|
+|mixtral-moe-7b-instruct|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|transformers>=4.36|
 |baichuan-7b|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)|W_pack|default-generation|&#x2718;|&#x2714;|transformers<4.34|
 |baichuan-13b|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;|transformers<4.34|
 |baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;|transformers<4.34|
@@ -104,11 +106,11 @@
 |tongyi-finance-14b-chat-int4|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|
 |codefuse-codellama-34b-chat|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)|q_proj, k_proj, v_proj|codefuse-codellama|&#x2714;|&#x2714;||
 |deepseek-coder-1_3b|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||
-|deepseek-coder-1_3b-chat|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||
+|deepseek-coder-1_3b-instruct|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||
 |deepseek-coder-6_7b|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||
-|deepseek-coder-6_7b-chat|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||
+|deepseek-coder-6_7b-instruct|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||
 |deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||
-|deepseek-coder-33b-chat|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||
+|deepseek-coder-33b-instruct|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|&#x2714;|&#x2714;||
 |phi2-3b|[AI-ModelScope/phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary)|Wqkv|default-generation|&#x2714;|&#x2714;||
 |cogagent-chat|[ZhipuAI/cogagent-chat](https://modelscope.cn/models/ZhipuAI/cogagent-chat/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent|&#x2718;|&#x2718;||
 |cogagent-vqa|[ZhipuAI/cogagent-vqa](https://modelscope.cn/models/ZhipuAI/cogagent-vqa/summary)|vision_expert_query_key_value, vision_expert_dense, language_expert_query_key_value, language_expert_dense, query, key_value, dense|cogagent|&#x2718;|&#x2718;||
@@ -172,5 +174,8 @@
 |ner-jave-zh|[damo/zh_ner-JAVE](https://modelscope.cn/datasets/damo/zh_ner-JAVE/summary)|1266|0|118.3±45.5, min=44, max=223|chat, ner|
 |coco-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|414113|40504|298.8±2.8, min=294, max=351|chat, multi-modal, vision|
 |🔥coco-mini-en|[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)|20000|200|298.8±2.8, min=294, max=339|chat, multi-modal, vision|
+|capcha-images|[AI-ModelScope/captcha-images](https://modelscope.cn/datasets/AI-ModelScope/captcha-images/summary)|6000|2000|29.0±0.0, min=29, max=29|chat, multi-modal, vision|
 |aishell1-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|134424|7176|152.2±36.8, min=63, max=419|chat, multi-modal, audio|
 |🔥aishell1-mini-zh|[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets/summary)|14326|200|152.0±35.5, min=74, max=359|chat, multi-modal, audio|
+|stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)|4483004|0|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|
+|hh-rlhf|[AI-ModelScope/hh-rlhf](https://modelscope.cn/datasets/AI-ModelScope/hh-rlhf/summary)|42537|2312|163.4±117.7, min=27, max=964|hfrl, dpo, pairwise|
diff --git "a/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md"
index fe30db7974..008bdc1299 100644
--- "a/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md"
+++ "b/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md"
@@ -260,14 +260,14 @@ CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'qwen-7b-chat/vx-xxx/checkpoint-xx
 import os
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
-from swift.llm import InferArguments, merge_lora_main, app_ui_main
+from swift.llm import AppUIArguments, merge_lora_main, app_ui_main
 
 best_model_checkpoint = 'qwen-7b-chat/vx-xxx/checkpoint-xxx'
-infer_args = InferArguments(
+app_ui_args = AppUIArguments(
     ckpt_dir=best_model_checkpoint,
     eval_human=True)
-# merge_lora_main(infer_args)
-result = app_ui_main(infer_args)
+# merge_lora_main(app_ui_args)
+result = app_ui_main(app_ui_args)
 ```
 
 使用CLI:
diff --git a/examples/pytorch/llm/app.py b/examples/pytorch/llm/app.py
index c9a2083033..9a746ac7d4 100644
--- a/examples/pytorch/llm/app.py
+++ b/examples/pytorch/llm/app.py
@@ -3,14 +3,14 @@
 # os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import custom
 
-from swift.llm import InferArguments, ModelType, app_ui_main
+from swift.llm import AppUIArguments, ModelType, app_ui_main
 
 if __name__ == '__main__':
     # Please refer to the `infer.sh` for setting the parameters.
     # text-generation
-    # args = InferArguments(model_type=ModelType.chatglm3_6b_base)
+    # args = AppUIArguments(model_type=ModelType.chatglm3_6b_base)
     # or chat
-    args = InferArguments(model_type=ModelType.qwen_7b_chat_int4)
+    args = AppUIArguments(model_type=ModelType.qwen_7b_chat_int4)
     # or load from ckpt dir
-    # args = InferArguments(ckpt_dir='xxx/vx_xxx/checkpoint-xxx')
+    # args = AppUIArguments(ckpt_dir='xxx/vx_xxx/checkpoint-xxx')
     app_ui_main(args)
diff --git a/examples/pytorch/llm/scripts/deepseek_moe_16b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/deepseek_moe_16b_chat/lora/infer.sh
new file mode 100644
index 0000000000..146337506a
--- /dev/null
+++ b/examples/pytorch/llm/scripts/deepseek_moe_16b_chat/lora/infer.sh
@@ -0,0 +1,12 @@
+# Experimental environment: A100
+CUDA_VISIBLE_DEVICES=0 \
+swift infer \
+    --ckpt_dir "output/deepseek-moe-16b-chat/vx_xxx/checkpoint-xxx" \
+    --load_dataset_config true \
+    --max_length 4096 \
+    --use_flash_attn true \
+    --max_new_tokens 2048 \
+    --temperature 0.1 \
+    --top_p 0.7 \
+    --repetition_penalty 1.05 \
+    --do_sample true \
diff --git a/examples/pytorch/llm/scripts/deepseek_moe_16b_chat/lora/sft.sh b/examples/pytorch/llm/scripts/deepseek_moe_16b_chat/lora/sft.sh
new file mode 100644
index 0000000000..3a0828cb33
--- /dev/null
+++ b/examples/pytorch/llm/scripts/deepseek_moe_16b_chat/lora/sft.sh
@@ -0,0 +1,12 @@
+# Experimental environment: A100
+# 52GB GPU memory
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+    --model_type deepseek-moe-16b-chat \
+    --dataset damo-agent-mini-zh \
+    --train_dataset_sample 20000 \
+    --max_length 4096 \
+    --gradient_checkpointing true \
+    --eval_steps 100 \
+    --use_flash_attn true \
+    --output_dir output \
diff --git a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_ddp_ds/infer.sh
similarity index 85%
rename from examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh
rename to examples/pytorch/llm/scripts/mistral_7b_instruct/lora_ddp_ds/infer.sh
index ebbd08ee99..e8e1c112b4 100644
--- a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_ddp_ds/infer.sh
@@ -3,7 +3,7 @@
 PYTHONPATH=../../.. \
 CUDA_VISIBLE_DEVICES=0 \
 python llm_infer.py \
-    --ckpt_dir "output/mistral-7b-chat/vx_xxx/checkpoint-xxx" \
+    --ckpt_dir "output/mistral-7b-instruct/vx_xxx/checkpoint-xxx" \
     --load_dataset_config true \
     --max_length 4096 \
     --max_new_tokens 2048 \
diff --git a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/sft.sh b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_ddp_ds/sft.sh
similarity index 96%
rename from examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/sft.sh
rename to examples/pytorch/llm/scripts/mistral_7b_instruct/lora_ddp_ds/sft.sh
index 63590f83d7..363fc92d7d 100644
--- a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/sft.sh
+++ b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_ddp_ds/sft.sh
@@ -37,7 +37,7 @@ torchrun \
     --save_total_limit 2 \
     --logging_steps 10 \
     --push_to_hub false \
-    --hub_model_id mistral-7b-chat-lora \
+    --hub_model_id mistral-7b-instruct-lora \
     --hub_private_repo true \
     --hub_token 'your-sdk-token' \
     --deepspeed_config_path 'ds_config/zero2.json' \
diff --git a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_mp_ddp/infer.sh
similarity index 85%
rename from examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh
rename to examples/pytorch/llm/scripts/mistral_7b_instruct/lora_mp_ddp/infer.sh
index a3c6932be7..0ac3874235 100644
--- a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_mp_ddp/infer.sh
@@ -3,7 +3,7 @@
 PYTHONPATH=../../.. \
 CUDA_VISIBLE_DEVICES=0 \
 python llm_infer.py \
-    --ckpt_dir "output/mistral-7b-chat/vx_xxx/checkpoint-xxx" \
+    --ckpt_dir "output/mistral-7b-instruct/vx_xxx/checkpoint-xxx" \
     --load_dataset_config true \
     --max_length 4096 \
     --max_new_tokens 2048 \
diff --git a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/sft.sh b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_mp_ddp/sft.sh
similarity index 96%
rename from examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/sft.sh
rename to examples/pytorch/llm/scripts/mistral_7b_instruct/lora_mp_ddp/sft.sh
index cd0dfbd6cd..77e1d83562 100644
--- a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/mistral_7b_instruct/lora_mp_ddp/sft.sh
@@ -37,6 +37,6 @@ torchrun \
     --save_total_limit 2 \
     --logging_steps 10 \
     --push_to_hub false \
-    --hub_model_id mistral-7b-chat-lora \
+    --hub_model_id mistral-7b-instruct-lora \
     --hub_private_repo true \
     --hub_token 'your-sdk-token' \
diff --git a/examples/pytorch/llm/scripts/mixtral_7b_moe/lora/infer.sh b/examples/pytorch/llm/scripts/mixtral_moe_7b/lora/infer.sh
similarity index 85%
rename from examples/pytorch/llm/scripts/mixtral_7b_moe/lora/infer.sh
rename to examples/pytorch/llm/scripts/mixtral_moe_7b/lora/infer.sh
index b8299c8e77..778db911ee 100644
--- a/examples/pytorch/llm/scripts/mixtral_7b_moe/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/mixtral_moe_7b/lora/infer.sh
@@ -3,7 +3,7 @@
 PYTHONPATH=../../.. \
 CUDA_VISIBLE_DEVICES=0,1 \
 python llm_infer.py \
-    --ckpt_dir "output/mixtral-7b-moe/vx_xxx/checkpoint-xxx" \
+    --ckpt_dir "output/mixtral-moe-7b/vx_xxx/checkpoint-xxx" \
     --load_dataset_config true \
     --max_length 2048 \
     --use_flash_attn true \
diff --git a/examples/pytorch/llm/scripts/mixtral_7b_moe/lora/sft.sh b/examples/pytorch/llm/scripts/mixtral_moe_7b/lora/sft.sh
similarity index 100%
rename from examples/pytorch/llm/scripts/mixtral_7b_moe/lora/sft.sh
rename to examples/pytorch/llm/scripts/mixtral_moe_7b/lora/sft.sh
diff --git a/examples/pytorch/llm/scripts/mixtral_7b_moe_chat/lora/infer.sh b/examples/pytorch/llm/scripts/mixtral_moe_7b_instruct/lora/infer.sh
similarity index 83%
rename from examples/pytorch/llm/scripts/mixtral_7b_moe_chat/lora/infer.sh
rename to examples/pytorch/llm/scripts/mixtral_moe_7b_instruct/lora/infer.sh
index c30c0aaf6b..40e0e35233 100644
--- a/examples/pytorch/llm/scripts/mixtral_7b_moe_chat/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/mixtral_moe_7b_instruct/lora/infer.sh
@@ -3,7 +3,7 @@
 PYTHONPATH=../../.. \
 CUDA_VISIBLE_DEVICES=0,1 \
 python llm_infer.py \
-    --ckpt_dir "output/mixtral-7b-moe-chat/vx_xxx/checkpoint-xxx" \
+    --ckpt_dir "output/mixtral-moe-7b-instruct/vx_xxx/checkpoint-xxx" \
     --load_dataset_config true \
     --max_length 2048 \
     --use_flash_attn true \
diff --git a/examples/pytorch/llm/scripts/mixtral_7b_moe_chat/lora/sft.sh b/examples/pytorch/llm/scripts/mixtral_moe_7b_instruct/lora/sft.sh
similarity index 100%
rename from examples/pytorch/llm/scripts/mixtral_7b_moe_chat/lora/sft.sh
rename to examples/pytorch/llm/scripts/mixtral_moe_7b_instruct/lora/sft.sh
diff --git a/swift/llm/app_ui.py b/swift/llm/app_ui.py
index b3896d338a..eafcc831a9 100644
--- a/swift/llm/app_ui.py
+++ b/swift/llm/app_ui.py
@@ -3,7 +3,7 @@
 
 from swift.utils import get_main
 from .infer import merge_lora, prepare_model_template
-from .utils import (History, InferArguments, inference_stream,
+from .utils import (AppUIArguments, History, inference_stream,
                     limit_history_length)
 
 
@@ -11,7 +11,7 @@ def clear_session() -> History:
     return []
 
 
-def gradio_generation_demo(args: InferArguments) -> None:
+def gradio_generation_demo(args: AppUIArguments) -> None:
     import gradio as gr
     if args.infer_backend == 'vllm':
         from swift.llm import prepare_vllm_engine_template, inference_stream_vllm, inference_vllm
@@ -43,10 +43,18 @@ def model_generation(query: str) -> str:
                 output_box = gr.Textbox(lines=16, label='Output', max_lines=16)
         send = gr.Button('🚀 发送')
         send.click(model_generation, inputs=[input_box], outputs=[output_box])
-    demo.queue().launch(height=1000, share=args.share)
-
-
-def gradio_chat_demo(args: InferArguments) -> None:
+    # Compatible with InferArguments
+    share = getattr(args, 'share', False)
+    server_name = getattr(args, 'server_name', '127.0.0.1')
+    server_port = getattr(args, 'server_port', 7860)
+    demo.queue().launch(
+        height=1000,
+        share=share,
+        server_name=server_name,
+        server_port=server_port)
+
+
+def gradio_chat_demo(args: AppUIArguments) -> None:
     import gradio as gr
     if args.infer_backend == 'vllm':
         from swift.llm import prepare_vllm_engine_template, inference_stream_vllm
@@ -86,10 +94,18 @@ def model_chat(query: str, history: History) -> Tuple[str, History]:
             model_chat, inputs=[message, chatbot], outputs=[message, chatbot])
         clear_history.click(
             fn=clear_session, inputs=[], outputs=[chatbot], queue=False)
-    demo.queue().launch(height=1000, share=args.share)
-
-
-def llm_app_ui(args: InferArguments) -> None:
+    # Compatible with InferArguments
+    share = getattr(args, 'share', False)
+    server_name = getattr(args, 'server_name', '127.0.0.1')
+    server_port = getattr(args, 'server_port', 7860)
+    demo.queue().launch(
+        height=1000,
+        share=share,
+        server_name=server_name,
+        server_port=server_port)
+
+
+def llm_app_ui(args: AppUIArguments) -> None:
     args.eval_human = True
     if args.merge_lora_and_save:
         merge_lora(args, device_map='cpu')
@@ -99,4 +115,4 @@ def llm_app_ui(args: InferArguments) -> None:
         gradio_chat_demo(args)
 
 
-app_ui_main = get_main(InferArguments, llm_app_ui)
+app_ui_main = get_main(AppUIArguments, llm_app_ui)
diff --git a/swift/llm/utils/__init__.py b/swift/llm/utils/__init__.py
index 2726797467..5470565ad2 100644
--- a/swift/llm/utils/__init__.py
+++ b/swift/llm/utils/__init__.py
@@ -1,6 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from .argument import (DeployArguments, DPOArguments, InferArguments,
-                       RomeArguments, SftArguments, is_lora)
+from .argument import (AppUIArguments, DeployArguments, DPOArguments,
+                       InferArguments, RomeArguments, SftArguments, is_lora)
 from .client_utils import get_model_list_client, inference_client
 from .dataset import (DATASET_MAPPING, DatasetName, GetDatasetFunction,
                       HfDataset, add_self_cognition_dataset, get_dataset,
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index 9fcb521c4f..896b94b06e 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -165,6 +165,9 @@ class SftArguments:
     top_p: float = 0.7
     repetition_penalty: float = 1.05
     num_beams: int = 1
+    # compatibility hf
+    per_device_train_batch_size: Optional[int] = None
+    per_device_eval_batch_size: Optional[int] = None
     # compatibility. (Deprecated)
     only_save_model: Optional[bool] = None
 
@@ -307,8 +310,9 @@ def __post_init__(self) -> None:
             'support_gradient_checkpointing', True)
         if self.gradient_checkpointing is None:
             self.gradient_checkpointing = support_gradient_checkpointing
-        elif not support_gradient_checkpointing:
-            assert self.gradient_checkpointing is False, f'{self.model_type} not support gradient_checkpointing.'
+        elif not support_gradient_checkpointing and self.gradient_checkpointing is True:
+            logger.warning(
+                f'{self.model_type} not support gradient_checkpointing.')
 
 
 @dataclass
@@ -374,8 +378,6 @@ class InferArguments:
     save_safetensors: bool = True
     overwrite_generation_config: Optional[bool] = None
     verbose: Optional[bool] = None
-    # app-ui
-    share: bool = False
     # vllm
     gpu_memory_utilization: float = 0.9
     tensor_parallel_size: int = 1
@@ -473,6 +475,13 @@ def check_ckpt_dir_correct(ckpt_dir) -> bool:
         return os.path.isfile(os.path.join(ckpt_dir, 'configuration.json'))
 
 
+@dataclass
+class AppUIArguments(InferArguments):
+    server_name: str = '127.0.0.1'
+    server_port: int = 7860
+    share: bool = False
+
+
 @dataclass
 class DeployArguments(InferArguments):
     host: str = '127.0.0.1'
@@ -603,17 +612,21 @@ def handle_compatibility(args: Union[SftArguments, InferArguments]) -> None:
         args.template_type = 'chatglm-generation'
     if args.template_type == 'chatml':
         args.template_type = TemplateType.qwen
-    if (isinstance(args, InferArguments) and args.show_dataset_sample != 10
-            and args.val_dataset_sample == 10):
-        # args.val_dataset_sample is the default value and args.show_dataset_sample is not the default value.
-        args.val_dataset_sample = args.show_dataset_sample
     if args.truncation_strategy == 'ignore':
         args.truncation_strategy = 'delete'
-    if isinstance(args,
-                  InferArguments) and args.safe_serialization is not None:
-        args.save_safetensors = args.safe_serialization
-    if isinstance(args, SftArguments) and args.only_save_model is not None:
-        args.save_only_model = args.only_save_model
+    if isinstance(args, InferArguments):
+        if args.show_dataset_sample != 10 and args.val_dataset_sample == 10:
+            # args.val_dataset_sample is the default value and args.show_dataset_sample is not the default value.
+            args.val_dataset_sample = args.show_dataset_sample
+        if args.safe_serialization is not None:
+            args.save_safetensors = args.safe_serialization
+    if isinstance(args, SftArguments):
+        if args.only_save_model is not None:
+            args.save_only_model = args.only_save_model
+        if args.per_device_train_batch_size is not None:
+            args.batch_size = args.per_device_train_batch_size
+        if args.per_device_eval_batch_size is not None:
+            args.eval_batch_size = args.per_device_eval_batch_size
 
 
 def set_model_type(args: Union[SftArguments, InferArguments]) -> None:
diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py
index ac93b279cb..3c8769f209 100644
--- a/swift/llm/utils/dataset.py
+++ b/swift/llm/utils/dataset.py
@@ -581,7 +581,7 @@ def reorganize_row_simple(sample) -> Dict[str, str]:
     [('harmless-base', 'test')],
     process_hh_rlhf,
     get_dataset_from_repo,
-    tags=['hfrl', 'dpo', 'pairwise'])
+    tags=['hfrl', 'dpo', 'pairwise', '🔥'])
 
 register_dataset(
     DatasetName.medical_zh,
@@ -668,7 +668,7 @@ def add_system(row):
     [('default', 'validation')],
     _preprocess_capcha_images,
     get_dataset_from_repo,
-    tags=['chat', 'multi-modal', 'vision', '🔥'])
+    tags=['chat', 'multi-modal', 'vision'])
 
 register_dataset(
     DatasetName.cls_fudan_news_zh,
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 2feba9832f..5e8b81d8d9 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -81,6 +81,8 @@ class ModelType:
     # deepseek
     deepseek_7b = 'deepseek-7b'
     deepseek_7b_chat = 'deepseek-7b-chat'
+    deepseek_moe_16b = 'deepseek-moe-16b'
+    deepseek_moe_16b_chat = 'deepseek-moe-16b-chat'
     deepseek_67b = 'deepseek-67b'
     deepseek_67b_chat = 'deepseek-67b-chat'
     # openbuddy
@@ -92,10 +94,10 @@ class ModelType:
     openbuddy_deepseek_67b_chat = 'openbuddy-deepseek-67b-chat'
     # mistral
     mistral_7b = 'mistral-7b'
-    mistral_7b_chat = 'mistral-7b-chat'
-    mistral_7b_chat_v2 = 'mistral-7b-chat-v2'
-    mixtral_7b_moe = 'mixtral-7b-moe'
-    mixtral_7b_moe_chat = 'mixtral-7b-moe-chat'
+    mistral_7b_instruct = 'mistral-7b-instruct'
+    mistral_7b_instruct_v2 = 'mistral-7b-instruct-v2'
+    mixtral_moe_7b = 'mixtral-moe-7b'
+    mixtral_moe_7b_instruct = 'mixtral-moe-7b-instruct'
     # baichuan
     baichuan_7b = 'baichuan-7b'
     baichuan_13b = 'baichuan-13b'
@@ -147,11 +149,11 @@ class ModelType:
     codefuse_codellama_34b_chat = 'codefuse-codellama-34b-chat'
     # deepseek-coder
     deepseek_coder_1_3b = 'deepseek-coder-1_3b'
-    deepseek_coder_1_3b_chat = 'deepseek-coder-1_3b-chat'
+    deepseek_coder_1_3b_instruct = 'deepseek-coder-1_3b-instruct'
     deepseek_coder_6_7b = 'deepseek-coder-6_7b'
-    deepseek_coder_6_7b_chat = 'deepseek-coder-6_7b-chat'
+    deepseek_coder_6_7b_instruct = 'deepseek-coder-6_7b-instruct'
     deepseek_coder_33b = 'deepseek-coder-33b'
-    deepseek_coder_33b_chat = 'deepseek-coder-33b-chat'
+    deepseek_coder_33b_instruct = 'deepseek-coder-33b-instruct'
     # phi
     phi2_3b = 'phi2-3b'
 
@@ -631,7 +633,7 @@ def cross_entropy_forward(self, inputs: Tensor,
     support_flash_attn=True,
     support_vllm=True)
 @register_model(
-    ModelType.deepseek_coder_1_3b_chat,
+    ModelType.deepseek_coder_1_3b_instruct,
     'deepseek-ai/deepseek-coder-1.3b-instruct',
     LoRATM.llama2,
     TemplateType.deepseek_coder,
@@ -639,7 +641,7 @@ def cross_entropy_forward(self, inputs: Tensor,
     support_flash_attn=True,
     support_vllm=True)
 @register_model(
-    ModelType.deepseek_coder_6_7b_chat,
+    ModelType.deepseek_coder_6_7b_instruct,
     'deepseek-ai/deepseek-coder-6.7b-instruct',
     LoRATM.llama2,
     TemplateType.deepseek_coder,
@@ -647,7 +649,7 @@ def cross_entropy_forward(self, inputs: Tensor,
     support_flash_attn=True,
     support_vllm=True)
 @register_model(
-    ModelType.deepseek_coder_33b_chat,
+    ModelType.deepseek_coder_33b_instruct,
     'deepseek-ai/deepseek-coder-33b-instruct',
     LoRATM.llama2,
     TemplateType.deepseek_coder,
@@ -800,7 +802,7 @@ def cross_entropy_forward(self, inputs: Tensor,
     support_flash_attn=True,
     support_vllm=True)
 @register_model(
-    ModelType.mistral_7b_chat,
+    ModelType.mistral_7b_instruct,
     'AI-ModelScope/Mistral-7B-Instruct-v0.1',
     LoRATM.llama2,
     TemplateType.llama,
@@ -808,7 +810,7 @@ def cross_entropy_forward(self, inputs: Tensor,
     support_flash_attn=True,
     support_vllm=True)
 @register_model(
-    ModelType.mistral_7b_chat_v2,
+    ModelType.mistral_7b_instruct_v2,
     'AI-ModelScope/Mistral-7B-Instruct-v0.2',
     LoRATM.llama2,
     TemplateType.llama,
@@ -824,7 +826,7 @@ def cross_entropy_forward(self, inputs: Tensor,
     support_flash_attn=True,
     support_vllm=True)
 @register_model(
-    ModelType.mixtral_7b_moe,
+    ModelType.mixtral_moe_7b,
     'AI-ModelScope/Mixtral-8x7B-v0.1',
     LoRATM.llama2,
     TemplateType.default_generation_bos,
@@ -833,7 +835,7 @@ def cross_entropy_forward(self, inputs: Tensor,
     support_vllm=True,
     support_gradient_checkpointing=False)
 @register_model(
-    ModelType.mixtral_7b_moe_chat,
+    ModelType.mixtral_moe_7b_instruct,
     'AI-ModelScope/Mixtral-8x7B-Instruct-v0.1',
     LoRATM.llama2,
     TemplateType.llama,
@@ -1371,6 +1373,44 @@ def get_model_tokenizer_phi(model_dir: str,
                                          load_model, model_config, **kwargs)
 
 
+@register_model(
+    ModelType.deepseek_moe_16b_chat,
+    'deepseek-ai/deepseek-moe-16b-chat',
+    LoRATM.llama2,
+    TemplateType.deepseek,
+    support_flash_attn=True)
+@register_model(
+    ModelType.deepseek_moe_16b,
+    'deepseek-ai/deepseek-moe-16b-base',
+    LoRATM.llama2,
+    TemplateType.default_generation_bos,
+    support_flash_attn=True)
+def get_model_tokenizer_deepseek_moe(model_dir: str,
+                                     torch_dtype: Dtype,
+                                     model_kwargs: Dict[str, Any],
+                                     load_model: bool = True,
+                                     **kwargs):
+    model, tokenizer = get_model_tokenizer_with_flash_attn(
+        model_dir, torch_dtype, model_kwargs, load_model, **kwargs)
+    if model is not None:
+        # fix dtype bug
+        mlp_cls = model.model.layers[1].mlp.__class__
+        if not hasattr(mlp_cls, '__old_forward'):  # Avoid double patching
+            __old_forward = mlp_cls._old_forward if hasattr(
+                mlp_cls, '_old_forward') else mlp_cls.forward
+
+            def _new_forward(self, hidden_states) -> Tensor:
+                dtype = hidden_states.dtype
+                return __old_forward(self, hidden_states).to(dtype)
+
+            if hasattr(mlp_cls, '_old_forward'):  # device_map
+                mlp_cls._old_forward = _new_forward
+            else:
+                mlp_cls.forward = _new_forward
+            mlp_cls.__old_forward = __old_forward
+    return model, tokenizer
+
+
 def fix_transformers_upgrade(module: PreTrainedModel) -> None:
     # from 4.35, transformers changes its arguments of _set_gradient_checkpointing
     if version.parse(transformers.__version__) >= version.parse('4.35'):
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index c4dfeac5c4..14e5914442 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -32,8 +32,8 @@ class TemplateType:
     zephyr = 'zephyr'
     sus = 'sus'
     deepseek = 'deepseek'
-    codefuse_codellama = 'codefuse-codellama'
     deepseek_coder = 'deepseek-coder'
+    codefuse_codellama = 'codefuse-codellama'
     cogagent = 'cogagent'
     # compatibility. (Deprecated)
     chatml = 'chatml'